* coding.c (QCcategory): New variable.
[bpt/emacs.git] / src / coding.c
1 /* Coding system handler (conversion, detection, etc).
2 Copyright (C) 1995, 1997, 1998 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /*** TABLE OF CONTENTS ***
27
28 0. General comments
29 1. Preamble
30 2. Emacs' internal format (emacs-utf-8) handlers
31 3. UTF-8 handlers
32 4. UTF-16 handlers
33 5. Charset-base coding systems handlers
34 6. emacs-mule (old Emacs' internal format) handlers
35 7. ISO2022 handlers
36 8. Shift-JIS and BIG5 handlers
37 9. CCL handlers
38 10. C library functions
39 11. Emacs Lisp library functions
40 12. Postamble
41
42 */
43
44 /*** 0. General comments ***
45
46
47 CODING SYSTEM
48
49 A coding system is an object for an encoding mechanism that contains
50 information about how to convert byte sequences to character
51 sequences and vice versa. When we say "decode", it means converting
52 a byte sequence of a specific coding system into a character
53 sequence that is represented by Emacs' internal coding system
54 `emacs-utf-8', and when we say "encode", it means converting a
55 character sequence of emacs-utf-8 to a byte sequence of a specific
56 coding system.
57
58 In Emacs Lisp, a coding system is represented by a Lisp symbol. In
59 C level, a coding system is represented by a vector of attributes
60 stored in the hash table Vcharset_hash_table. The conversion from
61 coding system symbol to attributes vector is done by looking up
62 Vcharset_hash_table by the symbol.
63
64 Coding systems are classified into the following types depending on
65 the encoding mechanism. Here's a brief description of the types.
66
67 o UTF-8
68
69 o UTF-16
70
71 o Charset-base coding system
72
73 A coding system defined by one or more (coded) character sets.
74 Decoding and encoding are done by a code converter defined for each
75 character set.
76
77 o Old Emacs internal format (emacs-mule)
78
79 The coding system adopted by old versions of Emacs (20 and 21).
80
81 o ISO2022-base coding system
82
83 The most famous coding system for multiple character sets. X's
84 Compound Text, various EUCs (Extended Unix Code), and coding systems
85 used in the Internet communication such as ISO-2022-JP are all
86 variants of ISO2022.
87
88 o SJIS (or Shift-JIS or MS-Kanji-Code)
89
90 A coding system to encode character sets: ASCII, JISX0201, and
91 JISX0208. Widely used for PC's in Japan. Details are described in
92 section 8.
93
94 o BIG5
95
96 A coding system to encode character sets: ASCII and Big5. Widely
97 used for Chinese (mainly in Taiwan and Hong Kong). Details are
98 described in section 8. In this file, when we write "big5" (all
99 lowercase), we mean the coding system, and when we write "Big5"
100 (capitalized), we mean the character set.
101
102 o CCL
103
104 If a user wants to decode/encode text encoded in a coding system
105 not listed above, he can supply a decoder and an encoder for it in
106 CCL (Code Conversion Language) programs. Emacs executes the CCL
107 program while decoding/encoding.
108
109 o Raw-text
110
111 A coding system for text containing raw eight-bit data. Emacs
112 treats each byte of source text as a character (except for
113 end-of-line conversion).
114
115 o No-conversion
116
117 Like raw text, but don't do end-of-line conversion.
118
119
120 END-OF-LINE FORMAT
121
122 How text end-of-line is encoded depends on operating system. For
123 instance, Unix's format is just one byte of LF (line-feed) code,
124 whereas DOS's format is two-byte sequence of `carriage-return' and
125 `line-feed' codes. MacOS's format is usually one byte of
126 `carriage-return'.
127
128 Since text character encoding and end-of-line encoding are
129 independent, any coding system described above can take any format
130 of end-of-line (except for no-conversion).
131
132 STRUCT CODING_SYSTEM
133
134 Before using a coding system for code conversion (i.e. decoding and
135 encoding), we setup a structure of type `struct coding_system'.
136 This structure keeps various information about a specific code
137 conversion (e.g. the location of source and destination data).
138
139 */
140
141 /* COMMON MACROS */
142
143
144 /*** GENERAL NOTES on `detect_coding_XXX ()' functions ***
145
146 These functions check if a byte sequence specified as a source in
147 CODING conforms to the format of XXX, and update the members of
148 DETECT_INFO.
149
150 Return 1 if the byte sequence conforms to XXX, otherwise return 0.
151
152 Below is the template of these functions. */
153
154 #if 0
155 static int
156 detect_coding_XXX (coding, detect_info)
157 struct coding_system *coding;
158 struct coding_detection_info *detect_info;
159 {
160 unsigned char *src = coding->source;
161 unsigned char *src_end = coding->source + coding->src_bytes;
162 int multibytep = coding->src_multibyte;
163 int consumed_chars = 0;
164 int found = 0;
165 ...;
166
167 while (1)
168 {
169 /* Get one byte from the source. If the souce is exausted, jump
170 to no_more_source:. */
171 ONE_MORE_BYTE (c);
172
173 if (! __C_conforms_to_XXX___ (c))
174 break;
175 if (! __C_strongly_suggests_XXX__ (c))
176 found = CATEGORY_MASK_XXX;
177 }
178 /* The byte sequence is invalid for XXX. */
179 detect_info->rejected |= CATEGORY_MASK_XXX;
180 return 0;
181
182 no_more_source:
183 /* The source exausted successfully. */
184 detect_info->found |= found;
185 return 1;
186 }
187 #endif
188
189 /*** GENERAL NOTES on `decode_coding_XXX ()' functions ***
190
191 These functions decode a byte sequence specified as a source by
192 CODING. The resulting multibyte text goes to a place pointed to by
193 CODING->charbuf, the length of which should not exceed
194 CODING->charbuf_size;
195
196 These functions set the information of original and decoded texts in
197 CODING->consumed, CODING->consumed_char, and CODING->charbuf_used.
198 They also set CODING->result to one of CODING_RESULT_XXX indicating
199 how the decoding is finished.
200
201 Below is the template of these functions. */
202
203 #if 0
204 static void
205 decode_coding_XXXX (coding)
206 struct coding_system *coding;
207 {
208 unsigned char *src = coding->source + coding->consumed;
209 unsigned char *src_end = coding->source + coding->src_bytes;
210 /* SRC_BASE remembers the start position in source in each loop.
211 The loop will be exited when there's not enough source code, or
212 when there's no room in CHARBUF for a decoded character. */
213 unsigned char *src_base;
214 /* A buffer to produce decoded characters. */
215 int *charbuf = coding->charbuf;
216 int *charbuf_end = charbuf + coding->charbuf_size;
217 int multibytep = coding->src_multibyte;
218
219 while (1)
220 {
221 src_base = src;
222 if (charbuf < charbuf_end)
223 /* No more room to produce a decoded character. */
224 break;
225 ONE_MORE_BYTE (c);
226 /* Decode it. */
227 }
228
229 no_more_source:
230 if (src_base < src_end
231 && coding->mode & CODING_MODE_LAST_BLOCK)
232 /* If the source ends by partial bytes to construct a character,
233 treat them as eight-bit raw data. */
234 while (src_base < src_end && charbuf < charbuf_end)
235 *charbuf++ = *src_base++;
236 /* Remember how many bytes and characters we consumed. If the
237 source is multibyte, the bytes and chars are not identical. */
238 coding->consumed = coding->consumed_char = src_base - coding->source;
239 /* Remember how many characters we produced. */
240 coding->charbuf_used = charbuf - coding->charbuf;
241 }
242 #endif
243
244 /*** GENERAL NOTES on `encode_coding_XXX ()' functions ***
245
246 These functions encode SRC_BYTES length text at SOURCE of Emacs'
247 internal multibyte format by CODING. The resulting byte sequence
248 goes to a place pointed to by DESTINATION, the length of which
249 should not exceed DST_BYTES.
250
251 These functions set the information of original and encoded texts in
252 the members produced, produced_char, consumed, and consumed_char of
253 the structure *CODING. They also set the member result to one of
254 CODING_RESULT_XXX indicating how the encoding finished.
255
256 DST_BYTES zero means that source area and destination area are
257 overlapped, which means that we can produce a encoded text until it
258 reaches at the head of not-yet-encoded source text.
259
260 Below is a template of these functions. */
261 #if 0
262 static void
263 encode_coding_XXX (coding)
264 struct coding_system *coding;
265 {
266 int multibytep = coding->dst_multibyte;
267 int *charbuf = coding->charbuf;
268 int *charbuf_end = charbuf->charbuf + coding->charbuf_used;
269 unsigned char *dst = coding->destination + coding->produced;
270 unsigned char *dst_end = coding->destination + coding->dst_bytes;
271 unsigned char *adjusted_dst_end = dst_end - _MAX_BYTES_PRODUCED_IN_LOOP_;
272 int produced_chars = 0;
273
274 for (; charbuf < charbuf_end && dst < adjusted_dst_end; charbuf++)
275 {
276 int c = *charbuf;
277 /* Encode C into DST, and increment DST. */
278 }
279 label_no_more_destination:
280 /* How many chars and bytes we produced. */
281 coding->produced_char += produced_chars;
282 coding->produced = dst - coding->destination;
283 }
284 #endif
285
286 \f
287 /*** 1. Preamble ***/
288
289 #include <config.h>
290 #include <stdio.h>
291
292 #include "lisp.h"
293 #include "buffer.h"
294 #include "character.h"
295 #include "charset.h"
296 #include "ccl.h"
297 #include "composite.h"
298 #include "coding.h"
299 #include "window.h"
300
301 Lisp_Object Vcoding_system_hash_table;
302
303 Lisp_Object Qcoding_system, Qcoding_aliases, Qeol_type;
304 Lisp_Object Qunix, Qdos;
305 extern Lisp_Object Qmac; /* frame.c */
306 Lisp_Object Qbuffer_file_coding_system;
307 Lisp_Object Qpost_read_conversion, Qpre_write_conversion;
308 Lisp_Object Qdefault_char;
309 Lisp_Object Qno_conversion, Qundecided;
310 Lisp_Object Qcharset, Qiso_2022, Qutf_8, Qutf_16, Qshift_jis, Qbig5;
311 Lisp_Object Qbig, Qlittle;
312 Lisp_Object Qcoding_system_history;
313 Lisp_Object Qvalid_codes;
314 Lisp_Object QCcategory;
315
316 extern Lisp_Object Qinsert_file_contents, Qwrite_region;
317 Lisp_Object Qcall_process, Qcall_process_region, Qprocess_argument;
318 Lisp_Object Qstart_process, Qopen_network_stream;
319 Lisp_Object Qtarget_idx;
320
321 Lisp_Object Vselect_safe_coding_system_function;
322
323 /* Mnemonic string for each format of end-of-line. */
324 Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac;
325 /* Mnemonic string to indicate format of end-of-line is not yet
326 decided. */
327 Lisp_Object eol_mnemonic_undecided;
328
329 #ifdef emacs
330
331 Lisp_Object Vcoding_system_list, Vcoding_system_alist;
332
333 Lisp_Object Qcoding_system_p, Qcoding_system_error;
334
335 /* Coding system emacs-mule and raw-text are for converting only
336 end-of-line format. */
337 Lisp_Object Qemacs_mule, Qraw_text;
338
339 /* Coding-systems are handed between Emacs Lisp programs and C internal
340 routines by the following three variables. */
341 /* Coding-system for reading files and receiving data from process. */
342 Lisp_Object Vcoding_system_for_read;
343 /* Coding-system for writing files and sending data to process. */
344 Lisp_Object Vcoding_system_for_write;
345 /* Coding-system actually used in the latest I/O. */
346 Lisp_Object Vlast_coding_system_used;
347
348 /* A vector of length 256 which contains information about special
349 Latin codes (especially for dealing with Microsoft codes). */
350 Lisp_Object Vlatin_extra_code_table;
351
352 /* Flag to inhibit code conversion of end-of-line format. */
353 int inhibit_eol_conversion;
354
355 /* Flag to inhibit ISO2022 escape sequence detection. */
356 int inhibit_iso_escape_detection;
357
358 /* Flag to make buffer-file-coding-system inherit from process-coding. */
359 int inherit_process_coding_system;
360
361 /* Coding system to be used to encode text for terminal display. */
362 struct coding_system terminal_coding;
363
364 /* Coding system to be used to encode text for terminal display when
365 terminal coding system is nil. */
366 struct coding_system safe_terminal_coding;
367
368 /* Coding system of what is sent from terminal keyboard. */
369 struct coding_system keyboard_coding;
370
371 Lisp_Object Vfile_coding_system_alist;
372 Lisp_Object Vprocess_coding_system_alist;
373 Lisp_Object Vnetwork_coding_system_alist;
374
375 Lisp_Object Vlocale_coding_system;
376
377 #endif /* emacs */
378
379 /* Flag to tell if we look up translation table on character code
380 conversion. */
381 Lisp_Object Venable_character_translation;
382 /* Standard translation table to look up on decoding (reading). */
383 Lisp_Object Vstandard_translation_table_for_decode;
384 /* Standard translation table to look up on encoding (writing). */
385 Lisp_Object Vstandard_translation_table_for_encode;
386
387 Lisp_Object Qtranslation_table;
388 Lisp_Object Qtranslation_table_id;
389 Lisp_Object Qtranslation_table_for_decode;
390 Lisp_Object Qtranslation_table_for_encode;
391
392 /* Alist of charsets vs revision number. */
393 static Lisp_Object Vcharset_revision_table;
394
395 /* Default coding systems used for process I/O. */
396 Lisp_Object Vdefault_process_coding_system;
397
398 /* Global flag to tell that we can't call post-read-conversion and
399 pre-write-conversion functions. Usually the value is zero, but it
400 is set to 1 temporarily while such functions are running. This is
401 to avoid infinite recursive call. */
402 static int inhibit_pre_post_conversion;
403
404 /* Two special coding systems. */
405 Lisp_Object Vsjis_coding_system;
406 Lisp_Object Vbig5_coding_system;
407
408
409 static int detect_coding_utf_8 P_ ((struct coding_system *,
410 struct coding_detection_info *info));
411 static void decode_coding_utf_8 P_ ((struct coding_system *));
412 static int encode_coding_utf_8 P_ ((struct coding_system *));
413
414 static int detect_coding_utf_16 P_ ((struct coding_system *,
415 struct coding_detection_info *info));
416 static void decode_coding_utf_16 P_ ((struct coding_system *));
417 static int encode_coding_utf_16 P_ ((struct coding_system *));
418
419 static int detect_coding_iso_2022 P_ ((struct coding_system *,
420 struct coding_detection_info *info));
421 static void decode_coding_iso_2022 P_ ((struct coding_system *));
422 static int encode_coding_iso_2022 P_ ((struct coding_system *));
423
424 static int detect_coding_emacs_mule P_ ((struct coding_system *,
425 struct coding_detection_info *info));
426 static void decode_coding_emacs_mule P_ ((struct coding_system *));
427 static int encode_coding_emacs_mule P_ ((struct coding_system *));
428
429 static int detect_coding_sjis P_ ((struct coding_system *,
430 struct coding_detection_info *info));
431 static void decode_coding_sjis P_ ((struct coding_system *));
432 static int encode_coding_sjis P_ ((struct coding_system *));
433
434 static int detect_coding_big5 P_ ((struct coding_system *,
435 struct coding_detection_info *info));
436 static void decode_coding_big5 P_ ((struct coding_system *));
437 static int encode_coding_big5 P_ ((struct coding_system *));
438
439 static int detect_coding_ccl P_ ((struct coding_system *,
440 struct coding_detection_info *info));
441 static void decode_coding_ccl P_ ((struct coding_system *));
442 static int encode_coding_ccl P_ ((struct coding_system *));
443
444 static void decode_coding_raw_text P_ ((struct coding_system *));
445 static int encode_coding_raw_text P_ ((struct coding_system *));
446
447
448 /* ISO2022 section */
449
450 #define CODING_ISO_INITIAL(coding, reg) \
451 (XINT (AREF (AREF (CODING_ID_ATTRS ((coding)->id), \
452 coding_attr_iso_initial), \
453 reg)))
454
455
456 #define CODING_ISO_REQUEST(coding, charset_id) \
457 ((charset_id <= (coding)->max_charset_id \
458 ? (coding)->safe_charsets[charset_id] \
459 : -1))
460
461
462 #define CODING_ISO_FLAGS(coding) \
463 ((coding)->spec.iso_2022.flags)
464 #define CODING_ISO_DESIGNATION(coding, reg) \
465 ((coding)->spec.iso_2022.current_designation[reg])
466 #define CODING_ISO_INVOCATION(coding, plane) \
467 ((coding)->spec.iso_2022.current_invocation[plane])
468 #define CODING_ISO_SINGLE_SHIFTING(coding) \
469 ((coding)->spec.iso_2022.single_shifting)
470 #define CODING_ISO_BOL(coding) \
471 ((coding)->spec.iso_2022.bol)
472 #define CODING_ISO_INVOKED_CHARSET(coding, plane) \
473 CODING_ISO_DESIGNATION ((coding), CODING_ISO_INVOCATION ((coding), (plane)))
474
475 /* Control characters of ISO2022. */
476 /* code */ /* function */
477 #define ISO_CODE_LF 0x0A /* line-feed */
478 #define ISO_CODE_CR 0x0D /* carriage-return */
479 #define ISO_CODE_SO 0x0E /* shift-out */
480 #define ISO_CODE_SI 0x0F /* shift-in */
481 #define ISO_CODE_SS2_7 0x19 /* single-shift-2 for 7-bit code */
482 #define ISO_CODE_ESC 0x1B /* escape */
483 #define ISO_CODE_SS2 0x8E /* single-shift-2 */
484 #define ISO_CODE_SS3 0x8F /* single-shift-3 */
485 #define ISO_CODE_CSI 0x9B /* control-sequence-introducer */
486
487 /* All code (1-byte) of ISO2022 is classified into one of the
488 followings. */
489 enum iso_code_class_type
490 {
491 ISO_control_0, /* Control codes in the range
492 0x00..0x1F and 0x7F, except for the
493 following 5 codes. */
494 ISO_carriage_return, /* ISO_CODE_CR (0x0D) */
495 ISO_shift_out, /* ISO_CODE_SO (0x0E) */
496 ISO_shift_in, /* ISO_CODE_SI (0x0F) */
497 ISO_single_shift_2_7, /* ISO_CODE_SS2_7 (0x19) */
498 ISO_escape, /* ISO_CODE_SO (0x1B) */
499 ISO_control_1, /* Control codes in the range
500 0x80..0x9F, except for the
501 following 3 codes. */
502 ISO_single_shift_2, /* ISO_CODE_SS2 (0x8E) */
503 ISO_single_shift_3, /* ISO_CODE_SS3 (0x8F) */
504 ISO_control_sequence_introducer, /* ISO_CODE_CSI (0x9B) */
505 ISO_0x20_or_0x7F, /* Codes of the values 0x20 or 0x7F. */
506 ISO_graphic_plane_0, /* Graphic codes in the range 0x21..0x7E. */
507 ISO_0xA0_or_0xFF, /* Codes of the values 0xA0 or 0xFF. */
508 ISO_graphic_plane_1 /* Graphic codes in the range 0xA1..0xFE. */
509 };
510
511 /** The macros CODING_ISO_FLAG_XXX defines a flag bit of the
512 `iso-flags' attribute of an iso2022 coding system. */
513
514 /* If set, produce long-form designation sequence (e.g. ESC $ ( A)
515 instead of the correct short-form sequence (e.g. ESC $ A). */
516 #define CODING_ISO_FLAG_LONG_FORM 0x0001
517
518 /* If set, reset graphic planes and registers at end-of-line to the
519 initial state. */
520 #define CODING_ISO_FLAG_RESET_AT_EOL 0x0002
521
522 /* If set, reset graphic planes and registers before any control
523 characters to the initial state. */
524 #define CODING_ISO_FLAG_RESET_AT_CNTL 0x0004
525
526 /* If set, encode by 7-bit environment. */
527 #define CODING_ISO_FLAG_SEVEN_BITS 0x0008
528
529 /* If set, use locking-shift function. */
530 #define CODING_ISO_FLAG_LOCKING_SHIFT 0x0010
531
532 /* If set, use single-shift function. Overwrite
533 CODING_ISO_FLAG_LOCKING_SHIFT. */
534 #define CODING_ISO_FLAG_SINGLE_SHIFT 0x0020
535
536 /* If set, use designation escape sequence. */
537 #define CODING_ISO_FLAG_DESIGNATION 0x0040
538
539 /* If set, produce revision number sequence. */
540 #define CODING_ISO_FLAG_REVISION 0x0080
541
542 /* If set, produce ISO6429's direction specifying sequence. */
543 #define CODING_ISO_FLAG_DIRECTION 0x0100
544
545 /* If set, assume designation states are reset at beginning of line on
546 output. */
547 #define CODING_ISO_FLAG_INIT_AT_BOL 0x0200
548
549 /* If set, designation sequence should be placed at beginning of line
550 on output. */
551 #define CODING_ISO_FLAG_DESIGNATE_AT_BOL 0x0400
552
553 /* If set, do not encode unsafe charactes on output. */
554 #define CODING_ISO_FLAG_SAFE 0x0800
555
556 /* If set, extra latin codes (128..159) are accepted as a valid code
557 on input. */
558 #define CODING_ISO_FLAG_LATIN_EXTRA 0x1000
559
560 #define CODING_ISO_FLAG_COMPOSITION 0x2000
561
562 #define CODING_ISO_FLAG_EUC_TW_SHIFT 0x4000
563
564 #define CODING_ISO_FLAG_USE_ROMAN 0x8000
565
566 #define CODING_ISO_FLAG_USE_OLDJIS 0x10000
567
568 #define CODING_ISO_FLAG_FULL_SUPPORT 0x100000
569
570 /* A character to be produced on output if encoding of the original
571 character is prohibited by CODING_ISO_FLAG_SAFE. */
572 #define CODING_INHIBIT_CHARACTER_SUBSTITUTION '?'
573
574
575 /* UTF-16 section */
576 #define CODING_UTF_16_BOM(coding) \
577 ((coding)->spec.utf_16.bom)
578
579 #define CODING_UTF_16_ENDIAN(coding) \
580 ((coding)->spec.utf_16.endian)
581
582 #define CODING_UTF_16_SURROGATE(coding) \
583 ((coding)->spec.utf_16.surrogate)
584
585
586 /* CCL section */
587 #define CODING_CCL_DECODER(coding) \
588 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_decoder)
589 #define CODING_CCL_ENCODER(coding) \
590 AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_encoder)
591 #define CODING_CCL_VALIDS(coding) \
592 (XSTRING (AREF (CODING_ID_ATTRS ((coding)->id), coding_attr_ccl_valids)) \
593 ->data)
594
595 /* Index for each coding category in `coding_categories' */
596
597 enum coding_category
598 {
599 coding_category_iso_7,
600 coding_category_iso_7_tight,
601 coding_category_iso_8_1,
602 coding_category_iso_8_2,
603 coding_category_iso_7_else,
604 coding_category_iso_8_else,
605 coding_category_utf_8,
606 coding_category_utf_16_auto,
607 coding_category_utf_16_be,
608 coding_category_utf_16_le,
609 coding_category_utf_16_be_nosig,
610 coding_category_utf_16_le_nosig,
611 coding_category_charset,
612 coding_category_sjis,
613 coding_category_big5,
614 coding_category_ccl,
615 coding_category_emacs_mule,
616 /* All above are targets of code detection. */
617 coding_category_raw_text,
618 coding_category_undecided,
619 coding_category_max
620 };
621
622 /* Definitions of flag bits used in detect_coding_XXXX. */
623 #define CATEGORY_MASK_ISO_7 (1 << coding_category_iso_7)
624 #define CATEGORY_MASK_ISO_7_TIGHT (1 << coding_category_iso_7_tight)
625 #define CATEGORY_MASK_ISO_8_1 (1 << coding_category_iso_8_1)
626 #define CATEGORY_MASK_ISO_8_2 (1 << coding_category_iso_8_2)
627 #define CATEGORY_MASK_ISO_7_ELSE (1 << coding_category_iso_7_else)
628 #define CATEGORY_MASK_ISO_8_ELSE (1 << coding_category_iso_8_else)
629 #define CATEGORY_MASK_UTF_8 (1 << coding_category_utf_8)
630 #define CATEGORY_MASK_UTF_16_AUTO (1 << coding_category_utf_16_auto)
631 #define CATEGORY_MASK_UTF_16_BE (1 << coding_category_utf_16_be)
632 #define CATEGORY_MASK_UTF_16_LE (1 << coding_category_utf_16_le)
633 #define CATEGORY_MASK_UTF_16_BE_NOSIG (1 << coding_category_utf_16_be_nosig)
634 #define CATEGORY_MASK_UTF_16_LE_NOSIG (1 << coding_category_utf_16_le_nosig)
635 #define CATEGORY_MASK_CHARSET (1 << coding_category_charset)
636 #define CATEGORY_MASK_SJIS (1 << coding_category_sjis)
637 #define CATEGORY_MASK_BIG5 (1 << coding_category_big5)
638 #define CATEGORY_MASK_CCL (1 << coding_category_ccl)
639 #define CATEGORY_MASK_EMACS_MULE (1 << coding_category_emacs_mule)
640 #define CATEGORY_MASK_RAW_TEXT (1 << coding_category_raw_text)
641
642 /* This value is returned if detect_coding_mask () find nothing other
643 than ASCII characters. */
644 #define CATEGORY_MASK_ANY \
645 (CATEGORY_MASK_ISO_7 \
646 | CATEGORY_MASK_ISO_7_TIGHT \
647 | CATEGORY_MASK_ISO_8_1 \
648 | CATEGORY_MASK_ISO_8_2 \
649 | CATEGORY_MASK_ISO_7_ELSE \
650 | CATEGORY_MASK_ISO_8_ELSE \
651 | CATEGORY_MASK_UTF_8 \
652 | CATEGORY_MASK_UTF_16_BE \
653 | CATEGORY_MASK_UTF_16_LE \
654 | CATEGORY_MASK_UTF_16_BE_NOSIG \
655 | CATEGORY_MASK_UTF_16_LE_NOSIG \
656 | CATEGORY_MASK_CHARSET \
657 | CATEGORY_MASK_SJIS \
658 | CATEGORY_MASK_BIG5 \
659 | CATEGORY_MASK_CCL \
660 | CATEGORY_MASK_EMACS_MULE)
661
662
663 #define CATEGORY_MASK_ISO_7BIT \
664 (CATEGORY_MASK_ISO_7 | CATEGORY_MASK_ISO_7_TIGHT)
665
666 #define CATEGORY_MASK_ISO_8BIT \
667 (CATEGORY_MASK_ISO_8_1 | CATEGORY_MASK_ISO_8_2)
668
669 #define CATEGORY_MASK_ISO_ELSE \
670 (CATEGORY_MASK_ISO_7_ELSE | CATEGORY_MASK_ISO_8_ELSE)
671
672 #define CATEGORY_MASK_ISO_ESCAPE \
673 (CATEGORY_MASK_ISO_7 \
674 | CATEGORY_MASK_ISO_7_TIGHT \
675 | CATEGORY_MASK_ISO_7_ELSE \
676 | CATEGORY_MASK_ISO_8_ELSE)
677
678 #define CATEGORY_MASK_ISO \
679 ( CATEGORY_MASK_ISO_7BIT \
680 | CATEGORY_MASK_ISO_8BIT \
681 | CATEGORY_MASK_ISO_ELSE)
682
683 #define CATEGORY_MASK_UTF_16 \
684 (CATEGORY_MASK_UTF_16_BE \
685 | CATEGORY_MASK_UTF_16_LE \
686 | CATEGORY_MASK_UTF_16_BE_NOSIG \
687 | CATEGORY_MASK_UTF_16_LE_NOSIG)
688
689
690 /* List of symbols `coding-category-xxx' ordered by priority. This
691 variable is exposed to Emacs Lisp. */
692 static Lisp_Object Vcoding_category_list;
693
694 /* Table of coding categories (Lisp symbols). This variable is for
695 internal use oly. */
696 static Lisp_Object Vcoding_category_table;
697
698 /* Table of coding-categories ordered by priority. */
699 static enum coding_category coding_priorities[coding_category_max];
700
701 /* Nth element is a coding context for the coding system bound to the
702 Nth coding category. */
703 static struct coding_system coding_categories[coding_category_max];
704
705 /*** Commonly used macros and functions ***/
706
707 #ifndef min
708 #define min(a, b) ((a) < (b) ? (a) : (b))
709 #endif
710 #ifndef max
711 #define max(a, b) ((a) > (b) ? (a) : (b))
712 #endif
713
714 #define CODING_GET_INFO(coding, attrs, eol_type, charset_list) \
715 do { \
716 attrs = CODING_ID_ATTRS (coding->id); \
717 eol_type = CODING_ID_EOL_TYPE (coding->id); \
718 if (VECTORP (eol_type)) \
719 eol_type = Qunix; \
720 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
721 } while (0)
722
723
724 /* Safely get one byte from the source text pointed by SRC which ends
725 at SRC_END, and set C to that byte. If there are not enough bytes
726 in the source, it jumps to `no_more_source'. The caller
727 should declare and set these variables appropriately in advance:
728 src, src_end, multibytep
729 */
730
731 #define ONE_MORE_BYTE(c) \
732 do { \
733 if (src == src_end) \
734 { \
735 if (src_base < src) \
736 coding->result = CODING_RESULT_INSUFFICIENT_SRC; \
737 goto no_more_source; \
738 } \
739 c = *src++; \
740 if (multibytep && (c & 0x80)) \
741 { \
742 if ((c & 0xFE) != 0xC0) \
743 error ("Undecodable char found"); \
744 c = ((c & 1) << 6) | *src++; \
745 } \
746 consumed_chars++; \
747 } while (0)
748
749
750 #define ONE_MORE_BYTE_NO_CHECK(c) \
751 do { \
752 c = *src++; \
753 if (multibytep && (c & 0x80)) \
754 { \
755 if ((c & 0xFE) != 0xC0) \
756 error ("Undecodable char found"); \
757 c = ((c & 1) << 6) | *src++; \
758 } \
759 consumed_chars++; \
760 } while (0)
761
762
763 /* Store a byte C in the place pointed by DST and increment DST to the
764 next free point, and increment PRODUCED_CHARS. The caller should
765 assure that C is 0..127, and declare and set the variable `dst'
766 appropriately in advance.
767 */
768
769
770 #define EMIT_ONE_ASCII_BYTE(c) \
771 do { \
772 produced_chars++; \
773 *dst++ = (c); \
774 } while (0)
775
776
777 /* Like EMIT_ONE_ASCII_BYTE byt store two bytes; C1 and C2. */
778
779 #define EMIT_TWO_ASCII_BYTES(c1, c2) \
780 do { \
781 produced_chars += 2; \
782 *dst++ = (c1), *dst++ = (c2); \
783 } while (0)
784
785
786 /* Store a byte C in the place pointed by DST and increment DST to the
787 next free point, and increment PRODUCED_CHARS. If MULTIBYTEP is
788 nonzero, store in an appropriate multibyte from. The caller should
789 declare and set the variables `dst' and `multibytep' appropriately
790 in advance. */
791
792 #define EMIT_ONE_BYTE(c) \
793 do { \
794 produced_chars++; \
795 if (multibytep) \
796 { \
797 int ch = (c); \
798 if (ch >= 0x80) \
799 ch = BYTE8_TO_CHAR (ch); \
800 CHAR_STRING_ADVANCE (ch, dst); \
801 } \
802 else \
803 *dst++ = (c); \
804 } while (0)
805
806
807 /* Like EMIT_ONE_BYTE, but emit two bytes; C1 and C2. */
808
809 #define EMIT_TWO_BYTES(c1, c2) \
810 do { \
811 produced_chars += 2; \
812 if (multibytep) \
813 { \
814 int ch; \
815 \
816 ch = (c1); \
817 if (ch >= 0x80) \
818 ch = BYTE8_TO_CHAR (ch); \
819 CHAR_STRING_ADVANCE (ch, dst); \
820 ch = (c2); \
821 if (ch >= 0x80) \
822 ch = BYTE8_TO_CHAR (ch); \
823 CHAR_STRING_ADVANCE (ch, dst); \
824 } \
825 else \
826 { \
827 *dst++ = (c1); \
828 *dst++ = (c2); \
829 } \
830 } while (0)
831
832
833 #define EMIT_THREE_BYTES(c1, c2, c3) \
834 do { \
835 EMIT_ONE_BYTE (c1); \
836 EMIT_TWO_BYTES (c2, c3); \
837 } while (0)
838
839
840 #define EMIT_FOUR_BYTES(c1, c2, c3, c4) \
841 do { \
842 EMIT_TWO_BYTES (c1, c2); \
843 EMIT_TWO_BYTES (c3, c4); \
844 } while (0)
845
846
847 #define CODING_DECODE_CHAR(coding, src, src_base, src_end, charset, code, c) \
848 do { \
849 charset_map_loaded = 0; \
850 c = DECODE_CHAR (charset, code); \
851 if (charset_map_loaded) \
852 { \
853 unsigned char *orig = coding->source; \
854 EMACS_INT offset; \
855 \
856 coding_set_source (coding); \
857 offset = coding->source - orig; \
858 src += offset; \
859 src_base += offset; \
860 src_end += offset; \
861 } \
862 } while (0)
863
864
865 #define ASSURE_DESTINATION(bytes) \
866 do { \
867 if (dst + (bytes) >= dst_end) \
868 { \
869 int more_bytes = charbuf_end - charbuf + (bytes); \
870 \
871 dst = alloc_destination (coding, more_bytes, dst); \
872 dst_end = coding->destination + coding->dst_bytes; \
873 } \
874 } while (0)
875
876
877
878 static void
879 coding_set_source (coding)
880 struct coding_system *coding;
881 {
882 if (BUFFERP (coding->src_object))
883 {
884 struct buffer *buf = XBUFFER (coding->src_object);
885
886 if (coding->src_pos < 0)
887 coding->source = BUF_GAP_END_ADDR (buf) + coding->src_pos_byte;
888 else
889 coding->source = BUF_BYTE_ADDRESS (buf, coding->src_pos_byte);
890 }
891 else if (STRINGP (coding->src_object))
892 {
893 coding->source = (XSTRING (coding->src_object)->data
894 + coding->src_pos_byte);
895 }
896 else
897 /* Otherwise, the source is C string and is never relocated
898 automatically. Thus we don't have to update anything. */
899 ;
900 }
901
902 static void
903 coding_set_destination (coding)
904 struct coding_system *coding;
905 {
906 if (BUFFERP (coding->dst_object))
907 {
908 if (coding->src_pos < 0)
909 {
910 coding->destination = BEG_ADDR + coding->dst_pos_byte - 1;
911 coding->dst_bytes = (GAP_END_ADDR
912 - (coding->src_bytes - coding->consumed)
913 - coding->destination);
914 }
915 else
916 {
917 /* We are sure that coding->dst_pos_byte is before the gap
918 of the buffer. */
919 coding->destination = (BUF_BEG_ADDR (XBUFFER (coding->dst_object))
920 + coding->dst_pos_byte - 1);
921 coding->dst_bytes = (BUF_GAP_END_ADDR (XBUFFER (coding->dst_object))
922 - coding->destination);
923 }
924 }
925 else
926 /* Otherwise, the destination is C string and is never relocated
927 automatically. Thus we don't have to update anything. */
928 ;
929 }
930
931
932 static void
933 coding_alloc_by_realloc (coding, bytes)
934 struct coding_system *coding;
935 EMACS_INT bytes;
936 {
937 coding->destination = (unsigned char *) xrealloc (coding->destination,
938 coding->dst_bytes + bytes);
939 coding->dst_bytes += bytes;
940 }
941
942 static void
943 coding_alloc_by_making_gap (coding, bytes)
944 struct coding_system *coding;
945 EMACS_INT bytes;
946 {
947 if (BUFFERP (coding->dst_object)
948 && EQ (coding->src_object, coding->dst_object))
949 {
950 EMACS_INT add = coding->src_bytes - coding->consumed;
951
952 GAP_SIZE -= add; ZV += add; Z += add; ZV_BYTE += add; Z_BYTE += add;
953 make_gap (bytes);
954 GAP_SIZE += add; ZV -= add; Z -= add; ZV_BYTE -= add; Z_BYTE -= add;
955 }
956 else
957 {
958 Lisp_Object this_buffer;
959
960 this_buffer = Fcurrent_buffer ();
961 set_buffer_internal (XBUFFER (coding->dst_object));
962 make_gap (bytes);
963 set_buffer_internal (XBUFFER (this_buffer));
964 }
965 }
966
967
968 static unsigned char *
969 alloc_destination (coding, nbytes, dst)
970 struct coding_system *coding;
971 int nbytes;
972 unsigned char *dst;
973 {
974 EMACS_INT offset = dst - coding->destination;
975
976 if (BUFFERP (coding->dst_object))
977 coding_alloc_by_making_gap (coding, nbytes);
978 else
979 coding_alloc_by_realloc (coding, nbytes);
980 coding->result = CODING_RESULT_SUCCESS;
981 coding_set_destination (coding);
982 dst = coding->destination + offset;
983 return dst;
984 }
985
986 /** Macros for annotations. */
987
988 /* Maximum length of annotation data (sum of annotations for
989 composition and charset). */
990 #define MAX_ANNOTATION_LENGTH (5 + (MAX_COMPOSITION_COMPONENTS * 2) - 1 + 5)
991
992 /* An annotation data is stored in the array coding->charbuf in this
993 format:
994 [ -LENGTH ANNOTATION_MASK FROM TO ... ]
995 LENGTH is the number of elements in the annotation.
996 ANNOTATION_MASK is one of CODING_ANNOTATE_XXX_MASK.
997 FROM and TO specify the range of text annotated. They are relative
998 to coding->src_pos (on encoding) or coding->dst_pos (on decoding).
999
1000 The format of the following elements depend on ANNOTATION_MASK.
1001
1002 In the case of CODING_ANNOTATE_COMPOSITION_MASK, these elements
1003 follows:
1004 ... METHOD [ COMPOSITION-COMPONENTS ... ]
1005 METHOD is one of enum composition_method.
1006 Optionnal COMPOSITION-COMPONENTS are characters and composition
1007 rules.
1008
1009 In the case of CODING_ANNOTATE_CHARSET_MASK, one element CHARSET-ID
1010 follows. */
1011
1012 #define ADD_ANNOTATION_DATA(buf, len, mask, from, to) \
1013 do { \
1014 *(buf)++ = -(len); \
1015 *(buf)++ = (mask); \
1016 *(buf)++ = (from); \
1017 *(buf)++ = (to); \
1018 coding->annotated = 1; \
1019 } while (0);
1020
1021 #define ADD_COMPOSITION_DATA(buf, from, to, method) \
1022 do { \
1023 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_COMPOSITION_MASK, from, to); \
1024 *buf++ = method; \
1025 } while (0)
1026
1027
1028 #define ADD_CHARSET_DATA(buf, from, to, id) \
1029 do { \
1030 ADD_ANNOTATION_DATA (buf, 5, CODING_ANNOTATE_CHARSET_MASK, from, to); \
1031 *buf++ = id; \
1032 } while (0)
1033
1034 \f
1035 /*** 2. Emacs' internal format (emacs-utf-8) ***/
1036
1037
1038
1039 \f
1040 /*** 3. UTF-8 ***/
1041
1042 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1043 Check if a text is encoded in UTF-8. If it is, return 1, else
1044 return 0. */
1045
1046 #define UTF_8_1_OCTET_P(c) ((c) < 0x80)
1047 #define UTF_8_EXTRA_OCTET_P(c) (((c) & 0xC0) == 0x80)
1048 #define UTF_8_2_OCTET_LEADING_P(c) (((c) & 0xE0) == 0xC0)
1049 #define UTF_8_3_OCTET_LEADING_P(c) (((c) & 0xF0) == 0xE0)
1050 #define UTF_8_4_OCTET_LEADING_P(c) (((c) & 0xF8) == 0xF0)
1051 #define UTF_8_5_OCTET_LEADING_P(c) (((c) & 0xFC) == 0xF8)
1052
1053 static int
1054 detect_coding_utf_8 (coding, detect_info)
1055 struct coding_system *coding;
1056 struct coding_detection_info *detect_info;
1057 {
1058 unsigned char *src = coding->source, *src_base = src;
1059 unsigned char *src_end = coding->source + coding->src_bytes;
1060 int multibytep = coding->src_multibyte;
1061 int consumed_chars = 0;
1062 int found = 0;
1063 int incomplete;
1064
1065 detect_info->checked |= CATEGORY_MASK_UTF_8;
1066 /* A coding system of this category is always ASCII compatible. */
1067 src += coding->head_ascii;
1068
1069 while (1)
1070 {
1071 int c, c1, c2, c3, c4;
1072
1073 incomplete = 0;
1074 ONE_MORE_BYTE (c);
1075 if (UTF_8_1_OCTET_P (c))
1076 continue;
1077 incomplete = 1;
1078 ONE_MORE_BYTE (c1);
1079 if (! UTF_8_EXTRA_OCTET_P (c1))
1080 break;
1081 if (UTF_8_2_OCTET_LEADING_P (c))
1082 {
1083 found = CATEGORY_MASK_UTF_8;
1084 continue;
1085 }
1086 ONE_MORE_BYTE (c2);
1087 if (! UTF_8_EXTRA_OCTET_P (c2))
1088 break;
1089 if (UTF_8_3_OCTET_LEADING_P (c))
1090 {
1091 found = CATEGORY_MASK_UTF_8;
1092 continue;
1093 }
1094 ONE_MORE_BYTE (c3);
1095 if (! UTF_8_EXTRA_OCTET_P (c3))
1096 break;
1097 if (UTF_8_4_OCTET_LEADING_P (c))
1098 {
1099 found = CATEGORY_MASK_UTF_8;
1100 continue;
1101 }
1102 ONE_MORE_BYTE (c4);
1103 if (! UTF_8_EXTRA_OCTET_P (c4))
1104 break;
1105 if (UTF_8_5_OCTET_LEADING_P (c))
1106 {
1107 found = CATEGORY_MASK_UTF_8;
1108 continue;
1109 }
1110 break;
1111 }
1112 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1113 return 0;
1114
1115 no_more_source:
1116 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1117 {
1118 detect_info->rejected |= CATEGORY_MASK_UTF_8;
1119 return 0;
1120 }
1121 detect_info->found |= found;
1122 return 1;
1123 }
1124
1125
1126 static void
1127 decode_coding_utf_8 (coding)
1128 struct coding_system *coding;
1129 {
1130 unsigned char *src = coding->source + coding->consumed;
1131 unsigned char *src_end = coding->source + coding->src_bytes;
1132 unsigned char *src_base;
1133 int *charbuf = coding->charbuf;
1134 int *charbuf_end = charbuf + coding->charbuf_size;
1135 int consumed_chars = 0, consumed_chars_base;
1136 int multibytep = coding->src_multibyte;
1137 Lisp_Object attr, eol_type, charset_list;
1138
1139 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1140
1141 while (1)
1142 {
1143 int c, c1, c2, c3, c4, c5;
1144
1145 src_base = src;
1146 consumed_chars_base = consumed_chars;
1147
1148 if (charbuf >= charbuf_end)
1149 break;
1150
1151 ONE_MORE_BYTE (c1);
1152 if (UTF_8_1_OCTET_P(c1))
1153 {
1154 c = c1;
1155 if (c == '\r')
1156 {
1157 if (EQ (eol_type, Qdos))
1158 {
1159 if (src == src_end)
1160 {
1161 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1162 goto no_more_source;
1163 }
1164 if (*src == '\n')
1165 ONE_MORE_BYTE (c);
1166 }
1167 else if (EQ (eol_type, Qmac))
1168 c = '\n';
1169 }
1170 }
1171 else
1172 {
1173 ONE_MORE_BYTE (c2);
1174 if (! UTF_8_EXTRA_OCTET_P (c2))
1175 goto invalid_code;
1176 if (UTF_8_2_OCTET_LEADING_P (c1))
1177 {
1178 c = ((c1 & 0x1F) << 6) | (c2 & 0x3F);
1179 /* Reject overlong sequences here and below. Encoders
1180 producing them are incorrect, they can be misleading,
1181 and they mess up read/write invariance. */
1182 if (c < 128)
1183 goto invalid_code;
1184 }
1185 else
1186 {
1187 ONE_MORE_BYTE (c3);
1188 if (! UTF_8_EXTRA_OCTET_P (c3))
1189 goto invalid_code;
1190 if (UTF_8_3_OCTET_LEADING_P (c1))
1191 {
1192 c = (((c1 & 0xF) << 12)
1193 | ((c2 & 0x3F) << 6) | (c3 & 0x3F));
1194 if (c < 0x800
1195 || (c >= 0xd800 && c < 0xe000)) /* surrogates (invalid) */
1196 goto invalid_code;
1197 }
1198 else
1199 {
1200 ONE_MORE_BYTE (c4);
1201 if (! UTF_8_EXTRA_OCTET_P (c4))
1202 goto invalid_code;
1203 if (UTF_8_4_OCTET_LEADING_P (c1))
1204 {
1205 c = (((c1 & 0x7) << 18) | ((c2 & 0x3F) << 12)
1206 | ((c3 & 0x3F) << 6) | (c4 & 0x3F));
1207 if (c < 0x10000)
1208 goto invalid_code;
1209 }
1210 else
1211 {
1212 ONE_MORE_BYTE (c5);
1213 if (! UTF_8_EXTRA_OCTET_P (c5))
1214 goto invalid_code;
1215 if (UTF_8_5_OCTET_LEADING_P (c1))
1216 {
1217 c = (((c1 & 0x3) << 24) | ((c2 & 0x3F) << 18)
1218 | ((c3 & 0x3F) << 12) | ((c4 & 0x3F) << 6)
1219 | (c5 & 0x3F));
1220 if ((c > MAX_CHAR) || (c < 0x200000))
1221 goto invalid_code;
1222 }
1223 else
1224 goto invalid_code;
1225 }
1226 }
1227 }
1228 }
1229
1230 *charbuf++ = c;
1231 continue;
1232
1233 invalid_code:
1234 src = src_base;
1235 consumed_chars = consumed_chars_base;
1236 ONE_MORE_BYTE (c);
1237 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
1238 coding->errors++;
1239 }
1240
1241 no_more_source:
1242 coding->consumed_char += consumed_chars_base;
1243 coding->consumed = src_base - coding->source;
1244 coding->charbuf_used = charbuf - coding->charbuf;
1245 }
1246
1247
1248 static int
1249 encode_coding_utf_8 (coding)
1250 struct coding_system *coding;
1251 {
1252 int multibytep = coding->dst_multibyte;
1253 int *charbuf = coding->charbuf;
1254 int *charbuf_end = charbuf + coding->charbuf_used;
1255 unsigned char *dst = coding->destination + coding->produced;
1256 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1257 int produced_chars = 0;
1258 int c;
1259
1260 if (multibytep)
1261 {
1262 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
1263
1264 while (charbuf < charbuf_end)
1265 {
1266 unsigned char str[MAX_MULTIBYTE_LENGTH], *p, *pend = str;
1267
1268 ASSURE_DESTINATION (safe_room);
1269 c = *charbuf++;
1270 if (CHAR_BYTE8_P (c))
1271 {
1272 c = CHAR_TO_BYTE8 (c);
1273 EMIT_ONE_BYTE (c);
1274 }
1275 else
1276 {
1277 CHAR_STRING_ADVANCE (c, pend);
1278 for (p = str; p < pend; p++)
1279 EMIT_ONE_BYTE (*p);
1280 }
1281 }
1282 }
1283 else
1284 {
1285 int safe_room = MAX_MULTIBYTE_LENGTH;
1286
1287 while (charbuf < charbuf_end)
1288 {
1289 ASSURE_DESTINATION (safe_room);
1290 c = *charbuf++;
1291 dst += CHAR_STRING (c, dst);
1292 produced_chars++;
1293 }
1294 }
1295 coding->result = CODING_RESULT_SUCCESS;
1296 coding->produced_char += produced_chars;
1297 coding->produced = dst - coding->destination;
1298 return 0;
1299 }
1300
1301
1302 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1303 Check if a text is encoded in one of UTF-16 based coding systems.
1304 If it is, return 1, else return 0. */
1305
1306 #define UTF_16_HIGH_SURROGATE_P(val) \
1307 (((val) & 0xFC00) == 0xD800)
1308
1309 #define UTF_16_LOW_SURROGATE_P(val) \
1310 (((val) & 0xFC00) == 0xDC00)
1311
1312 #define UTF_16_INVALID_P(val) \
1313 (((val) == 0xFFFE) \
1314 || ((val) == 0xFFFF) \
1315 || UTF_16_LOW_SURROGATE_P (val))
1316
1317
1318 static int
1319 detect_coding_utf_16 (coding, detect_info)
1320 struct coding_system *coding;
1321 struct coding_detection_info *detect_info;
1322 {
1323 unsigned char *src = coding->source, *src_base = src;
1324 unsigned char *src_end = coding->source + coding->src_bytes;
1325 int multibytep = coding->src_multibyte;
1326 int consumed_chars = 0;
1327 int c1, c2;
1328
1329 detect_info->checked |= CATEGORY_MASK_UTF_16;
1330
1331 if (coding->mode & CODING_MODE_LAST_BLOCK
1332 && (coding->src_bytes & 1))
1333 {
1334 detect_info->rejected |= CATEGORY_MASK_UTF_16;
1335 return 0;
1336 }
1337 ONE_MORE_BYTE (c1);
1338 ONE_MORE_BYTE (c2);
1339
1340 if ((c1 == 0xFF) && (c2 == 0xFE))
1341 {
1342 detect_info->found |= (CATEGORY_MASK_UTF_16_LE
1343 | CATEGORY_MASK_UTF_16_AUTO);
1344 detect_info->rejected |= CATEGORY_MASK_UTF_16_BE;
1345 }
1346 else if ((c1 == 0xFE) && (c2 == 0xFF))
1347 {
1348 detect_info->found |= (CATEGORY_MASK_UTF_16_BE
1349 | CATEGORY_MASK_UTF_16_AUTO);
1350 detect_info->rejected |= CATEGORY_MASK_UTF_16_LE;
1351 }
1352 no_more_source:
1353 return 1;
1354 }
1355
1356 static void
1357 decode_coding_utf_16 (coding)
1358 struct coding_system *coding;
1359 {
1360 unsigned char *src = coding->source + coding->consumed;
1361 unsigned char *src_end = coding->source + coding->src_bytes;
1362 unsigned char *src_base;
1363 int *charbuf = coding->charbuf;
1364 int *charbuf_end = charbuf + coding->charbuf_size;
1365 int consumed_chars = 0, consumed_chars_base;
1366 int multibytep = coding->src_multibyte;
1367 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1368 enum utf_16_endian_type endian = CODING_UTF_16_ENDIAN (coding);
1369 int surrogate = CODING_UTF_16_SURROGATE (coding);
1370 Lisp_Object attr, eol_type, charset_list;
1371
1372 CODING_GET_INFO (coding, attr, eol_type, charset_list);
1373
1374 if (bom == utf_16_with_bom)
1375 {
1376 int c, c1, c2;
1377
1378 src_base = src;
1379 ONE_MORE_BYTE (c1);
1380 ONE_MORE_BYTE (c2);
1381 c = (c1 << 8) | c2;
1382
1383 if (endian == utf_16_big_endian
1384 ? c != 0xFEFF : c != 0xFFFE)
1385 {
1386 /* The first two bytes are not BOM. Treat them as bytes
1387 for a normal character. */
1388 src = src_base;
1389 coding->errors++;
1390 }
1391 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1392 }
1393 else if (bom == utf_16_detect_bom)
1394 {
1395 /* We have already tried to detect BOM and failed in
1396 detect_coding. */
1397 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1398 }
1399
1400 while (1)
1401 {
1402 int c, c1, c2;
1403
1404 src_base = src;
1405 consumed_chars_base = consumed_chars;
1406
1407 if (charbuf + 2 >= charbuf_end)
1408 break;
1409
1410 ONE_MORE_BYTE (c1);
1411 ONE_MORE_BYTE (c2);
1412 c = (endian == utf_16_big_endian
1413 ? ((c1 << 8) | c2) : ((c2 << 8) | c1));
1414 if (surrogate)
1415 {
1416 if (! UTF_16_LOW_SURROGATE_P (c))
1417 {
1418 if (endian == utf_16_big_endian)
1419 c1 = surrogate >> 8, c2 = surrogate & 0xFF;
1420 else
1421 c1 = surrogate & 0xFF, c2 = surrogate >> 8;
1422 *charbuf++ = c1;
1423 *charbuf++ = c2;
1424 coding->errors++;
1425 if (UTF_16_HIGH_SURROGATE_P (c))
1426 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1427 else
1428 *charbuf++ = c;
1429 }
1430 else
1431 {
1432 c = ((surrogate - 0xD800) << 10) | (c - 0xDC00);
1433 CODING_UTF_16_SURROGATE (coding) = surrogate = 0;
1434 *charbuf++ = c;
1435 }
1436 }
1437 else
1438 {
1439 if (UTF_16_HIGH_SURROGATE_P (c))
1440 CODING_UTF_16_SURROGATE (coding) = surrogate = c;
1441 else
1442 *charbuf++ = c;
1443 }
1444 }
1445
1446 no_more_source:
1447 coding->consumed_char += consumed_chars_base;
1448 coding->consumed = src_base - coding->source;
1449 coding->charbuf_used = charbuf - coding->charbuf;
1450 }
1451
1452 static int
1453 encode_coding_utf_16 (coding)
1454 struct coding_system *coding;
1455 {
1456 int multibytep = coding->dst_multibyte;
1457 int *charbuf = coding->charbuf;
1458 int *charbuf_end = charbuf + coding->charbuf_used;
1459 unsigned char *dst = coding->destination + coding->produced;
1460 unsigned char *dst_end = coding->destination + coding->dst_bytes;
1461 int safe_room = 8;
1462 enum utf_16_bom_type bom = CODING_UTF_16_BOM (coding);
1463 int big_endian = CODING_UTF_16_ENDIAN (coding) == utf_16_big_endian;
1464 int produced_chars = 0;
1465 Lisp_Object attrs, eol_type, charset_list;
1466 int c;
1467
1468 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1469
1470 if (bom != utf_16_without_bom)
1471 {
1472 ASSURE_DESTINATION (safe_room);
1473 if (big_endian)
1474 EMIT_TWO_BYTES (0xFE, 0xFF);
1475 else
1476 EMIT_TWO_BYTES (0xFF, 0xFE);
1477 CODING_UTF_16_BOM (coding) = utf_16_without_bom;
1478 }
1479
1480 while (charbuf < charbuf_end)
1481 {
1482 ASSURE_DESTINATION (safe_room);
1483 c = *charbuf++;
1484 if (c >= MAX_UNICODE_CHAR)
1485 c = coding->default_char;
1486
1487 if (c < 0x10000)
1488 {
1489 if (big_endian)
1490 EMIT_TWO_BYTES (c >> 8, c & 0xFF);
1491 else
1492 EMIT_TWO_BYTES (c & 0xFF, c >> 8);
1493 }
1494 else
1495 {
1496 int c1, c2;
1497
1498 c -= 0x10000;
1499 c1 = (c >> 10) + 0xD800;
1500 c2 = (c & 0x3FF) + 0xDC00;
1501 if (big_endian)
1502 EMIT_FOUR_BYTES (c1 >> 8, c1 & 0xFF, c2 >> 8, c2 & 0xFF);
1503 else
1504 EMIT_FOUR_BYTES (c1 & 0xFF, c1 >> 8, c2 & 0xFF, c2 >> 8);
1505 }
1506 }
1507 coding->result = CODING_RESULT_SUCCESS;
1508 coding->produced = dst - coding->destination;
1509 coding->produced_char += produced_chars;
1510 return 0;
1511 }
1512
1513 \f
1514 /*** 6. Old Emacs' internal format (emacs-mule) ***/
1515
1516 /* Emacs' internal format for representation of multiple character
1517 sets is a kind of multi-byte encoding, i.e. characters are
1518 represented by variable-length sequences of one-byte codes.
1519
1520 ASCII characters and control characters (e.g. `tab', `newline') are
1521 represented by one-byte sequences which are their ASCII codes, in
1522 the range 0x00 through 0x7F.
1523
1524 8-bit characters of the range 0x80..0x9F are represented by
1525 two-byte sequences of LEADING_CODE_8_BIT_CONTROL and (their 8-bit
1526 code + 0x20).
1527
1528 8-bit characters of the range 0xA0..0xFF are represented by
1529 one-byte sequences which are their 8-bit code.
1530
1531 The other characters are represented by a sequence of `base
1532 leading-code', optional `extended leading-code', and one or two
1533 `position-code's. The length of the sequence is determined by the
1534 base leading-code. Leading-code takes the range 0x81 through 0x9D,
1535 whereas extended leading-code and position-code take the range 0xA0
1536 through 0xFF. See `charset.h' for more details about leading-code
1537 and position-code.
1538
1539 --- CODE RANGE of Emacs' internal format ---
1540 character set range
1541 ------------- -----
1542 ascii 0x00..0x7F
1543 eight-bit-control LEADING_CODE_8_BIT_CONTROL + 0xA0..0xBF
1544 eight-bit-graphic 0xA0..0xBF
1545 ELSE 0x81..0x9D + [0xA0..0xFF]+
1546 ---------------------------------------------
1547
1548 As this is the internal character representation, the format is
1549 usually not used externally (i.e. in a file or in a data sent to a
1550 process). But, it is possible to have a text externally in this
1551 format (i.e. by encoding by the coding system `emacs-mule').
1552
1553 In that case, a sequence of one-byte codes has a slightly different
1554 form.
1555
1556 At first, all characters in eight-bit-control are represented by
1557 one-byte sequences which are their 8-bit code.
1558
1559 Next, character composition data are represented by the byte
1560 sequence of the form: 0x80 METHOD BYTES CHARS COMPONENT ...,
1561 where,
1562 METHOD is 0xF0 plus one of composition method (enum
1563 composition_method),
1564
1565 BYTES is 0xA0 plus a byte length of this composition data,
1566
1567 CHARS is 0x20 plus a number of characters composed by this
1568 data,
1569
1570 COMPONENTs are characters of multibye form or composition
1571 rules encoded by two-byte of ASCII codes.
1572
1573 In addition, for backward compatibility, the following formats are
1574 also recognized as composition data on decoding.
1575
1576 0x80 MSEQ ...
1577 0x80 0xFF MSEQ RULE MSEQ RULE ... MSEQ
1578
1579 Here,
1580 MSEQ is a multibyte form but in these special format:
1581 ASCII: 0xA0 ASCII_CODE+0x80,
1582 other: LEADING_CODE+0x20 FOLLOWING-BYTE ...,
1583 RULE is a one byte code of the range 0xA0..0xF0 that
1584 represents a composition rule.
1585 */
1586
1587 char emacs_mule_bytes[256];
1588
1589 int
1590 emacs_mule_char (coding, src, nbytes, nchars, id)
1591 struct coding_system *coding;
1592 unsigned char *src;
1593 int *nbytes, *nchars, *id;
1594 {
1595 unsigned char *src_end = coding->source + coding->src_bytes;
1596 int multibytep = coding->src_multibyte;
1597 unsigned char *src_base = src;
1598 struct charset *charset;
1599 unsigned code;
1600 int c;
1601 int consumed_chars = 0;
1602
1603 ONE_MORE_BYTE (c);
1604 switch (emacs_mule_bytes[c])
1605 {
1606 case 2:
1607 if (! (charset = emacs_mule_charset[c]))
1608 goto invalid_code;
1609 ONE_MORE_BYTE (c);
1610 code = c & 0x7F;
1611 break;
1612
1613 case 3:
1614 if (c == EMACS_MULE_LEADING_CODE_PRIVATE_11
1615 || c == EMACS_MULE_LEADING_CODE_PRIVATE_12)
1616 {
1617 ONE_MORE_BYTE (c);
1618 if (! (charset = emacs_mule_charset[c]))
1619 goto invalid_code;
1620 ONE_MORE_BYTE (c);
1621 code = c & 0x7F;
1622 }
1623 else
1624 {
1625 if (! (charset = emacs_mule_charset[c]))
1626 goto invalid_code;
1627 ONE_MORE_BYTE (c);
1628 code = (c & 0x7F) << 8;
1629 ONE_MORE_BYTE (c);
1630 code |= c & 0x7F;
1631 }
1632 break;
1633
1634 case 4:
1635 ONE_MORE_BYTE (c);
1636 if (! (charset = emacs_mule_charset[c]))
1637 goto invalid_code;
1638 ONE_MORE_BYTE (c);
1639 code = (c & 0x7F) << 8;
1640 ONE_MORE_BYTE (c);
1641 code |= c & 0x7F;
1642 break;
1643
1644 case 1:
1645 code = c;
1646 charset = CHARSET_FROM_ID (ASCII_BYTE_P (code)
1647 ? charset_ascii : charset_eight_bit);
1648 break;
1649
1650 default:
1651 abort ();
1652 }
1653 c = DECODE_CHAR (charset, code);
1654 if (c < 0)
1655 goto invalid_code;
1656 *nbytes = src - src_base;
1657 *nchars = consumed_chars;
1658 if (id)
1659 *id = charset->id;
1660 return c;
1661
1662 no_more_source:
1663 return -2;
1664
1665 invalid_code:
1666 return -1;
1667 }
1668
1669
1670 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
1671 Check if a text is encoded in `emacs-mule'. If it is, return 1,
1672 else return 0. */
1673
1674 static int
1675 detect_coding_emacs_mule (coding, detect_info)
1676 struct coding_system *coding;
1677 struct coding_detection_info *detect_info;
1678 {
1679 unsigned char *src = coding->source, *src_base = src;
1680 unsigned char *src_end = coding->source + coding->src_bytes;
1681 int multibytep = coding->src_multibyte;
1682 int consumed_chars = 0;
1683 int c;
1684 int found = 0;
1685 int incomplete;
1686
1687 detect_info->checked |= CATEGORY_MASK_EMACS_MULE;
1688 /* A coding system of this category is always ASCII compatible. */
1689 src += coding->head_ascii;
1690
1691 while (1)
1692 {
1693 incomplete = 0;
1694 ONE_MORE_BYTE (c);
1695 incomplete = 1;
1696
1697 if (c == 0x80)
1698 {
1699 /* Perhaps the start of composite character. We simple skip
1700 it because analyzing it is too heavy for detecting. But,
1701 at least, we check that the composite character
1702 constitues of more than 4 bytes. */
1703 unsigned char *src_base;
1704
1705 repeat:
1706 src_base = src;
1707 do
1708 {
1709 ONE_MORE_BYTE (c);
1710 }
1711 while (c >= 0xA0);
1712
1713 if (src - src_base <= 4)
1714 break;
1715 found = CATEGORY_MASK_EMACS_MULE;
1716 if (c == 0x80)
1717 goto repeat;
1718 }
1719
1720 if (c < 0x80)
1721 {
1722 if (c < 0x20
1723 && (c == ISO_CODE_ESC || c == ISO_CODE_SI || c == ISO_CODE_SO))
1724 break;
1725 }
1726 else
1727 {
1728 unsigned char *src_base = src - 1;
1729
1730 do
1731 {
1732 ONE_MORE_BYTE (c);
1733 }
1734 while (c >= 0xA0);
1735 if (src - src_base != emacs_mule_bytes[*src_base])
1736 break;
1737 found = CATEGORY_MASK_EMACS_MULE;
1738 }
1739 }
1740 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1741 return 0;
1742
1743 no_more_source:
1744 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
1745 {
1746 detect_info->rejected |= CATEGORY_MASK_EMACS_MULE;
1747 return 0;
1748 }
1749 detect_info->found |= found;
1750 return 1;
1751 }
1752
1753
1754 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
1755
1756 /* Decode a character represented as a component of composition
1757 sequence of Emacs 20/21 style at SRC. Set C to that character and
1758 update SRC to the head of next character (or an encoded composition
1759 rule). If SRC doesn't points a composition component, set C to -1.
1760 If SRC points an invalid byte sequence, global exit by a return
1761 value 0. */
1762
1763 #define DECODE_EMACS_MULE_COMPOSITION_CHAR(buf) \
1764 if (1) \
1765 { \
1766 int c; \
1767 int nbytes, nchars; \
1768 \
1769 if (src == src_end) \
1770 break; \
1771 c = emacs_mule_char (coding, src, &nbytes, &nchars, NULL);\
1772 if (c < 0) \
1773 { \
1774 if (c == -2) \
1775 break; \
1776 goto invalid_code; \
1777 } \
1778 *buf++ = c; \
1779 src += nbytes; \
1780 consumed_chars += nchars; \
1781 } \
1782 else
1783
1784
1785 /* Decode a composition rule represented as a component of composition
1786 sequence of Emacs 20 style at SRC. Store the decoded rule in *BUF,
1787 and increment BUF. If SRC points an invalid byte sequence, set C
1788 to -1. */
1789
1790 #define DECODE_EMACS_MULE_COMPOSITION_RULE_20(buf) \
1791 do { \
1792 int c, gref, nref; \
1793 \
1794 if (src >= src_end) \
1795 goto invalid_code; \
1796 ONE_MORE_BYTE_NO_CHECK (c); \
1797 c -= 0x20; \
1798 if (c < 0 || c >= 81) \
1799 goto invalid_code; \
1800 \
1801 gref = c / 9, nref = c % 9; \
1802 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1803 } while (0)
1804
1805
1806 /* Decode a composition rule represented as a component of composition
1807 sequence of Emacs 21 style at SRC. Store the decoded rule in *BUF,
1808 and increment BUF. If SRC points an invalid byte sequence, set C
1809 to -1. */
1810
1811 #define DECODE_EMACS_MULE_COMPOSITION_RULE_21(buf) \
1812 do { \
1813 int gref, nref; \
1814 \
1815 if (src + 1>= src_end) \
1816 goto invalid_code; \
1817 ONE_MORE_BYTE_NO_CHECK (gref); \
1818 gref -= 0x20; \
1819 ONE_MORE_BYTE_NO_CHECK (nref); \
1820 nref -= 0x20; \
1821 if (gref < 0 || gref >= 81 \
1822 || nref < 0 || nref >= 81) \
1823 goto invalid_code; \
1824 *buf++ = COMPOSITION_ENCODE_RULE (gref, nref); \
1825 } while (0)
1826
1827
1828 #define DECODE_EMACS_MULE_21_COMPOSITION(c) \
1829 do { \
1830 /* Emacs 21 style format. The first three bytes at SRC are \
1831 (METHOD - 0xF2), (BYTES - 0xA0), (CHARS - 0xA0), where BYTES is \
1832 the byte length of this composition information, CHARS is the \
1833 number of characters composed by this composition. */ \
1834 enum composition_method method = c - 0xF2; \
1835 int *charbuf_base = charbuf; \
1836 int from, to; \
1837 int consumed_chars_limit; \
1838 int nbytes, nchars; \
1839 \
1840 ONE_MORE_BYTE (c); \
1841 nbytes = c - 0xA0; \
1842 if (nbytes < 3) \
1843 goto invalid_code; \
1844 ONE_MORE_BYTE (c); \
1845 nchars = c - 0xA0; \
1846 from = coding->produced + char_offset; \
1847 to = from + nchars; \
1848 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1849 consumed_chars_limit = consumed_chars_base + nbytes; \
1850 if (method != COMPOSITION_RELATIVE) \
1851 { \
1852 int i = 0; \
1853 while (consumed_chars < consumed_chars_limit) \
1854 { \
1855 if (i % 2 && method != COMPOSITION_WITH_ALTCHARS) \
1856 DECODE_EMACS_MULE_COMPOSITION_RULE_21 (charbuf); \
1857 else \
1858 DECODE_EMACS_MULE_COMPOSITION_CHAR (charbuf); \
1859 i++; \
1860 } \
1861 if (consumed_chars < consumed_chars_limit) \
1862 goto invalid_code; \
1863 charbuf_base[0] -= i; \
1864 } \
1865 } while (0)
1866
1867
1868 #define DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION(c) \
1869 do { \
1870 /* Emacs 20 style format for relative composition. */ \
1871 /* Store multibyte form of characters to be composed. */ \
1872 enum composition_method method = COMPOSITION_RELATIVE; \
1873 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1874 int *buf = components; \
1875 int i, j; \
1876 int from, to; \
1877 \
1878 src = src_base; \
1879 ONE_MORE_BYTE (c); /* skip 0x80 */ \
1880 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1881 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1882 if (i < 2) \
1883 goto invalid_code; \
1884 from = coding->produced_char + char_offset; \
1885 to = from + i; \
1886 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
1887 for (j = 0; j < i; j++) \
1888 *charbuf++ = components[j]; \
1889 } while (0)
1890
1891
1892 #define DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION(c) \
1893 do { \
1894 /* Emacs 20 style format for rule-base composition. */ \
1895 /* Store multibyte form of characters to be composed. */ \
1896 enum composition_method method = COMPOSITION_WITH_RULE; \
1897 int components[MAX_COMPOSITION_COMPONENTS * 2 - 1]; \
1898 int *buf = components; \
1899 int i, j; \
1900 int from, to; \
1901 \
1902 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1903 for (i = 0; i < MAX_COMPOSITION_COMPONENTS; i++) \
1904 { \
1905 DECODE_EMACS_MULE_COMPOSITION_RULE_20 (buf); \
1906 DECODE_EMACS_MULE_COMPOSITION_CHAR (buf); \
1907 } \
1908 if (i < 1 || (buf - components) % 2 == 0) \
1909 goto invalid_code; \
1910 if (charbuf + i + (i / 2) + 1 < charbuf_end) \
1911 goto no_more_source; \
1912 from = coding->produced_char + char_offset; \
1913 to = from + i; \
1914 ADD_COMPOSITION_DATA (buf, from, to, method); \
1915 for (j = 0; j < i; j++) \
1916 *charbuf++ = components[j]; \
1917 for (j = 0; j < i; j += 2) \
1918 *charbuf++ = components[j]; \
1919 } while (0)
1920
1921
1922 static void
1923 decode_coding_emacs_mule (coding)
1924 struct coding_system *coding;
1925 {
1926 unsigned char *src = coding->source + coding->consumed;
1927 unsigned char *src_end = coding->source + coding->src_bytes;
1928 unsigned char *src_base;
1929 int *charbuf = coding->charbuf;
1930 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
1931 int consumed_chars = 0, consumed_chars_base;
1932 int multibytep = coding->src_multibyte;
1933 Lisp_Object attrs, eol_type, charset_list;
1934 int char_offset = coding->produced_char;
1935 int last_offset = char_offset;
1936 int last_id = charset_ascii;
1937
1938 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
1939
1940 while (1)
1941 {
1942 int c;
1943
1944 src_base = src;
1945 consumed_chars_base = consumed_chars;
1946
1947 if (charbuf >= charbuf_end)
1948 break;
1949
1950 ONE_MORE_BYTE (c);
1951
1952 if (c < 0x80)
1953 {
1954 if (c == '\r')
1955 {
1956 if (EQ (eol_type, Qdos))
1957 {
1958 if (src == src_end)
1959 {
1960 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
1961 goto no_more_source;
1962 }
1963 if (*src == '\n')
1964 ONE_MORE_BYTE (c);
1965 }
1966 else if (EQ (eol_type, Qmac))
1967 c = '\n';
1968 }
1969 *charbuf++ = c;
1970 char_offset++;
1971 }
1972 else if (c == 0x80)
1973 {
1974 ONE_MORE_BYTE (c);
1975 if (c - 0xF2 >= COMPOSITION_RELATIVE
1976 && c - 0xF2 <= COMPOSITION_WITH_RULE_ALTCHARS)
1977 DECODE_EMACS_MULE_21_COMPOSITION (c);
1978 else if (c < 0xC0)
1979 DECODE_EMACS_MULE_20_RELATIVE_COMPOSITION (c);
1980 else if (c == 0xFF)
1981 DECODE_EMACS_MULE_20_RULEBASE_COMPOSITION (c);
1982 else
1983 goto invalid_code;
1984 }
1985 else if (c < 0xA0 && emacs_mule_bytes[c] > 1)
1986 {
1987 int nbytes, nchars;
1988 int id;
1989
1990 src = src_base;
1991 consumed_chars = consumed_chars_base;
1992 c = emacs_mule_char (coding, src, &nbytes, &nchars, &id);
1993 if (c < 0)
1994 {
1995 if (c == -2)
1996 break;
1997 goto invalid_code;
1998 }
1999 if (last_id != id)
2000 {
2001 if (last_id != charset_ascii)
2002 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2003 last_id = id;
2004 last_offset = char_offset;
2005 }
2006 *charbuf++ = c;
2007 src += nbytes;
2008 consumed_chars += nchars;
2009 char_offset++;
2010 }
2011 continue;
2012
2013 invalid_code:
2014 src = src_base;
2015 consumed_chars = consumed_chars_base;
2016 ONE_MORE_BYTE (c);
2017 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
2018 char_offset++;
2019 coding->errors++;
2020 }
2021
2022 no_more_source:
2023 if (last_id != charset_ascii)
2024 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
2025 coding->consumed_char += consumed_chars_base;
2026 coding->consumed = src_base - coding->source;
2027 coding->charbuf_used = charbuf - coding->charbuf;
2028 }
2029
2030
2031 #define EMACS_MULE_LEADING_CODES(id, codes) \
2032 do { \
2033 if (id < 0xA0) \
2034 codes[0] = id, codes[1] = 0; \
2035 else if (id < 0xE0) \
2036 codes[0] = 0x9A, codes[1] = id; \
2037 else if (id < 0xF0) \
2038 codes[0] = 0x9B, codes[1] = id; \
2039 else if (id < 0xF5) \
2040 codes[0] = 0x9C, codes[1] = id; \
2041 else \
2042 codes[0] = 0x9D, codes[1] = id; \
2043 } while (0);
2044
2045
2046 static int
2047 encode_coding_emacs_mule (coding)
2048 struct coding_system *coding;
2049 {
2050 int multibytep = coding->dst_multibyte;
2051 int *charbuf = coding->charbuf;
2052 int *charbuf_end = charbuf + coding->charbuf_used;
2053 unsigned char *dst = coding->destination + coding->produced;
2054 unsigned char *dst_end = coding->destination + coding->dst_bytes;
2055 int safe_room = 8;
2056 int produced_chars = 0;
2057 Lisp_Object attrs, eol_type, charset_list;
2058 int c;
2059 int preferred_charset_id = -1;
2060
2061 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2062
2063 while (charbuf < charbuf_end)
2064 {
2065 ASSURE_DESTINATION (safe_room);
2066 c = *charbuf++;
2067
2068 if (c < 0)
2069 {
2070 /* Handle an annotation. */
2071 switch (*charbuf)
2072 {
2073 case CODING_ANNOTATE_COMPOSITION_MASK:
2074 /* Not yet implemented. */
2075 break;
2076 case CODING_ANNOTATE_CHARSET_MASK:
2077 preferred_charset_id = charbuf[3];
2078 if (preferred_charset_id >= 0
2079 && NILP (Fmemq (make_number (preferred_charset_id),
2080 charset_list)))
2081 preferred_charset_id = -1;
2082 break;
2083 default:
2084 abort ();
2085 }
2086 charbuf += -c - 1;
2087 continue;
2088 }
2089
2090 if (ASCII_CHAR_P (c))
2091 EMIT_ONE_ASCII_BYTE (c);
2092 else if (CHAR_BYTE8_P (c))
2093 {
2094 c = CHAR_TO_BYTE8 (c);
2095 EMIT_ONE_BYTE (c);
2096 }
2097 else
2098 {
2099 struct charset *charset;
2100 unsigned code;
2101 int dimension;
2102 int emacs_mule_id;
2103 unsigned char leading_codes[2];
2104
2105 if (preferred_charset_id >= 0)
2106 {
2107 charset = CHARSET_FROM_ID (preferred_charset_id);
2108 if (! CHAR_CHARSET_P (c, charset))
2109 charset = char_charset (c, charset_list, NULL);
2110 }
2111 else
2112 charset = char_charset (c, charset_list, &code);
2113 if (! charset)
2114 {
2115 c = coding->default_char;
2116 if (ASCII_CHAR_P (c))
2117 {
2118 EMIT_ONE_ASCII_BYTE (c);
2119 continue;
2120 }
2121 charset = char_charset (c, charset_list, &code);
2122 }
2123 dimension = CHARSET_DIMENSION (charset);
2124 emacs_mule_id = CHARSET_EMACS_MULE_ID (charset);
2125 EMACS_MULE_LEADING_CODES (emacs_mule_id, leading_codes);
2126 EMIT_ONE_BYTE (leading_codes[0]);
2127 if (leading_codes[1])
2128 EMIT_ONE_BYTE (leading_codes[1]);
2129 if (dimension == 1)
2130 EMIT_ONE_BYTE (code);
2131 else
2132 {
2133 EMIT_ONE_BYTE (code >> 8);
2134 EMIT_ONE_BYTE (code & 0xFF);
2135 }
2136 }
2137 }
2138 coding->result = CODING_RESULT_SUCCESS;
2139 coding->produced_char += produced_chars;
2140 coding->produced = dst - coding->destination;
2141 return 0;
2142 }
2143
2144 \f
2145 /*** 7. ISO2022 handlers ***/
2146
2147 /* The following note describes the coding system ISO2022 briefly.
2148 Since the intention of this note is to help understand the
2149 functions in this file, some parts are NOT ACCURATE or are OVERLY
2150 SIMPLIFIED. For thorough understanding, please refer to the
2151 original document of ISO2022. This is equivalent to the standard
2152 ECMA-35, obtainable from <URL:http://www.ecma.ch/> (*).
2153
2154 ISO2022 provides many mechanisms to encode several character sets
2155 in 7-bit and 8-bit environments. For 7-bit environments, all text
2156 is encoded using bytes less than 128. This may make the encoded
2157 text a little bit longer, but the text passes more easily through
2158 several types of gateway, some of which strip off the MSB (Most
2159 Significant Bit).
2160
2161 There are two kinds of character sets: control character sets and
2162 graphic character sets. The former contain control characters such
2163 as `newline' and `escape' to provide control functions (control
2164 functions are also provided by escape sequences). The latter
2165 contain graphic characters such as 'A' and '-'. Emacs recognizes
2166 two control character sets and many graphic character sets.
2167
2168 Graphic character sets are classified into one of the following
2169 four classes, according to the number of bytes (DIMENSION) and
2170 number of characters in one dimension (CHARS) of the set:
2171 - DIMENSION1_CHARS94
2172 - DIMENSION1_CHARS96
2173 - DIMENSION2_CHARS94
2174 - DIMENSION2_CHARS96
2175
2176 In addition, each character set is assigned an identification tag,
2177 unique for each set, called the "final character" (denoted as <F>
2178 hereafter). The <F> of each character set is decided by ECMA(*)
2179 when it is registered in ISO. The code range of <F> is 0x30..0x7F
2180 (0x30..0x3F are for private use only).
2181
2182 Note (*): ECMA = European Computer Manufacturers Association
2183
2184 Here are examples of graphic character sets [NAME(<F>)]:
2185 o DIMENSION1_CHARS94 -- ASCII('B'), right-half-of-JISX0201('I'), ...
2186 o DIMENSION1_CHARS96 -- right-half-of-ISO8859-1('A'), ...
2187 o DIMENSION2_CHARS94 -- GB2312('A'), JISX0208('B'), ...
2188 o DIMENSION2_CHARS96 -- none for the moment
2189
2190 A code area (1 byte=8 bits) is divided into 4 areas, C0, GL, C1, and GR.
2191 C0 [0x00..0x1F] -- control character plane 0
2192 GL [0x20..0x7F] -- graphic character plane 0
2193 C1 [0x80..0x9F] -- control character plane 1
2194 GR [0xA0..0xFF] -- graphic character plane 1
2195
2196 A control character set is directly designated and invoked to C0 or
2197 C1 by an escape sequence. The most common case is that:
2198 - ISO646's control character set is designated/invoked to C0, and
2199 - ISO6429's control character set is designated/invoked to C1,
2200 and usually these designations/invocations are omitted in encoded
2201 text. In a 7-bit environment, only C0 can be used, and a control
2202 character for C1 is encoded by an appropriate escape sequence to
2203 fit into the environment. All control characters for C1 are
2204 defined to have corresponding escape sequences.
2205
2206 A graphic character set is at first designated to one of four
2207 graphic registers (G0 through G3), then these graphic registers are
2208 invoked to GL or GR. These designations and invocations can be
2209 done independently. The most common case is that G0 is invoked to
2210 GL, G1 is invoked to GR, and ASCII is designated to G0. Usually
2211 these invocations and designations are omitted in encoded text.
2212 In a 7-bit environment, only GL can be used.
2213
2214 When a graphic character set of CHARS94 is invoked to GL, codes
2215 0x20 and 0x7F of the GL area work as control characters SPACE and
2216 DEL respectively, and codes 0xA0 and 0xFF of the GR area should not
2217 be used.
2218
2219 There are two ways of invocation: locking-shift and single-shift.
2220 With locking-shift, the invocation lasts until the next different
2221 invocation, whereas with single-shift, the invocation affects the
2222 following character only and doesn't affect the locking-shift
2223 state. Invocations are done by the following control characters or
2224 escape sequences:
2225
2226 ----------------------------------------------------------------------
2227 abbrev function cntrl escape seq description
2228 ----------------------------------------------------------------------
2229 SI/LS0 (shift-in) 0x0F none invoke G0 into GL
2230 SO/LS1 (shift-out) 0x0E none invoke G1 into GL
2231 LS2 (locking-shift-2) none ESC 'n' invoke G2 into GL
2232 LS3 (locking-shift-3) none ESC 'o' invoke G3 into GL
2233 LS1R (locking-shift-1 right) none ESC '~' invoke G1 into GR (*)
2234 LS2R (locking-shift-2 right) none ESC '}' invoke G2 into GR (*)
2235 LS3R (locking-shift 3 right) none ESC '|' invoke G3 into GR (*)
2236 SS2 (single-shift-2) 0x8E ESC 'N' invoke G2 for one char
2237 SS3 (single-shift-3) 0x8F ESC 'O' invoke G3 for one char
2238 ----------------------------------------------------------------------
2239 (*) These are not used by any known coding system.
2240
2241 Control characters for these functions are defined by macros
2242 ISO_CODE_XXX in `coding.h'.
2243
2244 Designations are done by the following escape sequences:
2245 ----------------------------------------------------------------------
2246 escape sequence description
2247 ----------------------------------------------------------------------
2248 ESC '(' <F> designate DIMENSION1_CHARS94<F> to G0
2249 ESC ')' <F> designate DIMENSION1_CHARS94<F> to G1
2250 ESC '*' <F> designate DIMENSION1_CHARS94<F> to G2
2251 ESC '+' <F> designate DIMENSION1_CHARS94<F> to G3
2252 ESC ',' <F> designate DIMENSION1_CHARS96<F> to G0 (*)
2253 ESC '-' <F> designate DIMENSION1_CHARS96<F> to G1
2254 ESC '.' <F> designate DIMENSION1_CHARS96<F> to G2
2255 ESC '/' <F> designate DIMENSION1_CHARS96<F> to G3
2256 ESC '$' '(' <F> designate DIMENSION2_CHARS94<F> to G0 (**)
2257 ESC '$' ')' <F> designate DIMENSION2_CHARS94<F> to G1
2258 ESC '$' '*' <F> designate DIMENSION2_CHARS94<F> to G2
2259 ESC '$' '+' <F> designate DIMENSION2_CHARS94<F> to G3
2260 ESC '$' ',' <F> designate DIMENSION2_CHARS96<F> to G0 (*)
2261 ESC '$' '-' <F> designate DIMENSION2_CHARS96<F> to G1
2262 ESC '$' '.' <F> designate DIMENSION2_CHARS96<F> to G2
2263 ESC '$' '/' <F> designate DIMENSION2_CHARS96<F> to G3
2264 ----------------------------------------------------------------------
2265
2266 In this list, "DIMENSION1_CHARS94<F>" means a graphic character set
2267 of dimension 1, chars 94, and final character <F>, etc...
2268
2269 Note (*): Although these designations are not allowed in ISO2022,
2270 Emacs accepts them on decoding, and produces them on encoding
2271 CHARS96 character sets in a coding system which is characterized as
2272 7-bit environment, non-locking-shift, and non-single-shift.
2273
2274 Note (**): If <F> is '@', 'A', or 'B', the intermediate character
2275 '(' must be omitted. We refer to this as "short-form" hereafter.
2276
2277 Now you may notice that there are a lot of ways of encoding the
2278 same multilingual text in ISO2022. Actually, there exist many
2279 coding systems such as Compound Text (used in X11's inter client
2280 communication, ISO-2022-JP (used in Japanese Internet), ISO-2022-KR
2281 (used in Korean Internet), EUC (Extended UNIX Code, used in Asian
2282 localized platforms), and all of these are variants of ISO2022.
2283
2284 In addition to the above, Emacs handles two more kinds of escape
2285 sequences: ISO6429's direction specification and Emacs' private
2286 sequence for specifying character composition.
2287
2288 ISO6429's direction specification takes the following form:
2289 o CSI ']' -- end of the current direction
2290 o CSI '0' ']' -- end of the current direction
2291 o CSI '1' ']' -- start of left-to-right text
2292 o CSI '2' ']' -- start of right-to-left text
2293 The control character CSI (0x9B: control sequence introducer) is
2294 abbreviated to the escape sequence ESC '[' in a 7-bit environment.
2295
2296 Character composition specification takes the following form:
2297 o ESC '0' -- start relative composition
2298 o ESC '1' -- end composition
2299 o ESC '2' -- start rule-base composition (*)
2300 o ESC '3' -- start relative composition with alternate chars (**)
2301 o ESC '4' -- start rule-base composition with alternate chars (**)
2302 Since these are not standard escape sequences of any ISO standard,
2303 the use of them with these meanings is restricted to Emacs only.
2304
2305 (*) This form is used only in Emacs 20.7 and older versions,
2306 but newer versions can safely decode it.
2307 (**) This form is used only in Emacs 21.1 and newer versions,
2308 and older versions can't decode it.
2309
2310 Here's a list of example usages of these composition escape
2311 sequences (categorized by `enum composition_method').
2312
2313 COMPOSITION_RELATIVE:
2314 ESC 0 CHAR [ CHAR ] ESC 1
2315 COMPOSITION_WITH_RULE:
2316 ESC 2 CHAR [ RULE CHAR ] ESC 1
2317 COMPOSITION_WITH_ALTCHARS:
2318 ESC 3 ALTCHAR [ ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1
2319 COMPOSITION_WITH_RULE_ALTCHARS:
2320 ESC 4 ALTCHAR [ RULE ALTCHAR ] ESC 0 CHAR [ CHAR ] ESC 1 */
2321
2322 enum iso_code_class_type iso_code_class[256];
2323
2324 #define SAFE_CHARSET_P(coding, id) \
2325 ((id) <= (coding)->max_charset_id \
2326 && (coding)->safe_charsets[id] >= 0)
2327
2328
2329 #define SHIFT_OUT_OK(category) \
2330 (CODING_ISO_INITIAL (&coding_categories[category], 1) >= 0)
2331
2332 static void
2333 setup_iso_safe_charsets (attrs)
2334 Lisp_Object attrs;
2335 {
2336 Lisp_Object charset_list, safe_charsets;
2337 Lisp_Object request;
2338 Lisp_Object reg_usage;
2339 Lisp_Object tail;
2340 int reg94, reg96;
2341 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
2342 int max_charset_id;
2343
2344 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
2345 if ((flags & CODING_ISO_FLAG_FULL_SUPPORT)
2346 && ! EQ (charset_list, Viso_2022_charset_list))
2347 {
2348 CODING_ATTR_CHARSET_LIST (attrs)
2349 = charset_list = Viso_2022_charset_list;
2350 ASET (attrs, coding_attr_safe_charsets, Qnil);
2351 }
2352
2353 if (STRINGP (AREF (attrs, coding_attr_safe_charsets)))
2354 return;
2355
2356 max_charset_id = 0;
2357 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2358 {
2359 int id = XINT (XCAR (tail));
2360 if (max_charset_id < id)
2361 max_charset_id = id;
2362 }
2363
2364 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
2365 make_number (255));
2366 request = AREF (attrs, coding_attr_iso_request);
2367 reg_usage = AREF (attrs, coding_attr_iso_usage);
2368 reg94 = XINT (XCAR (reg_usage));
2369 reg96 = XINT (XCDR (reg_usage));
2370
2371 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
2372 {
2373 Lisp_Object id;
2374 Lisp_Object reg;
2375 struct charset *charset;
2376
2377 id = XCAR (tail);
2378 charset = CHARSET_FROM_ID (XINT (id));
2379 reg = Fcdr (Fassq (id, request));
2380 if (! NILP (reg))
2381 XSTRING (safe_charsets)->data[XINT (id)] = XINT (reg);
2382 else if (charset->iso_chars_96)
2383 {
2384 if (reg96 < 4)
2385 XSTRING (safe_charsets)->data[XINT (id)] = reg96;
2386 }
2387 else
2388 {
2389 if (reg94 < 4)
2390 XSTRING (safe_charsets)->data[XINT (id)] = reg94;
2391 }
2392 }
2393 ASET (attrs, coding_attr_safe_charsets, safe_charsets);
2394 }
2395
2396
2397 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
2398 Check if a text is encoded in one of ISO-2022 based codig systems.
2399 If it is, return 1, else return 0. */
2400
2401 static int
2402 detect_coding_iso_2022 (coding, detect_info)
2403 struct coding_system *coding;
2404 struct coding_detection_info *detect_info;
2405 {
2406 unsigned char *src = coding->source, *src_base = src;
2407 unsigned char *src_end = coding->source + coding->src_bytes;
2408 int multibytep = coding->src_multibyte;
2409 int single_shifting = 0;
2410 int id;
2411 int c, c1;
2412 int consumed_chars = 0;
2413 int i;
2414 int rejected = 0;
2415 int found = 0;
2416
2417 detect_info->checked |= CATEGORY_MASK_ISO;
2418
2419 for (i = coding_category_iso_7; i <= coding_category_iso_8_else; i++)
2420 {
2421 struct coding_system *this = &(coding_categories[i]);
2422 Lisp_Object attrs, val;
2423
2424 attrs = CODING_ID_ATTRS (this->id);
2425 if (CODING_ISO_FLAGS (this) & CODING_ISO_FLAG_FULL_SUPPORT
2426 && ! EQ (CODING_ATTR_SAFE_CHARSETS (attrs), Viso_2022_charset_list))
2427 setup_iso_safe_charsets (attrs);
2428 val = CODING_ATTR_SAFE_CHARSETS (attrs);
2429 this->max_charset_id = XSTRING (val)->size - 1;
2430 this->safe_charsets = (char *) XSTRING (val)->data;
2431 }
2432
2433 /* A coding system of this category is always ASCII compatible. */
2434 src += coding->head_ascii;
2435
2436 while (rejected != CATEGORY_MASK_ISO)
2437 {
2438 ONE_MORE_BYTE (c);
2439 switch (c)
2440 {
2441 case ISO_CODE_ESC:
2442 if (inhibit_iso_escape_detection)
2443 break;
2444 single_shifting = 0;
2445 ONE_MORE_BYTE (c);
2446 if (c >= '(' && c <= '/')
2447 {
2448 /* Designation sequence for a charset of dimension 1. */
2449 ONE_MORE_BYTE (c1);
2450 if (c1 < ' ' || c1 >= 0x80
2451 || (id = iso_charset_table[0][c >= ','][c1]) < 0)
2452 /* Invalid designation sequence. Just ignore. */
2453 break;
2454 }
2455 else if (c == '$')
2456 {
2457 /* Designation sequence for a charset of dimension 2. */
2458 ONE_MORE_BYTE (c);
2459 if (c >= '@' && c <= 'B')
2460 /* Designation for JISX0208.1978, GB2312, or JISX0208. */
2461 id = iso_charset_table[1][0][c];
2462 else if (c >= '(' && c <= '/')
2463 {
2464 ONE_MORE_BYTE (c1);
2465 if (c1 < ' ' || c1 >= 0x80
2466 || (id = iso_charset_table[1][c >= ','][c1]) < 0)
2467 /* Invalid designation sequence. Just ignore. */
2468 break;
2469 }
2470 else
2471 /* Invalid designation sequence. Just ignore it. */
2472 break;
2473 }
2474 else if (c == 'N' || c == 'O')
2475 {
2476 /* ESC <Fe> for SS2 or SS3. */
2477 single_shifting = 1;
2478 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2479 break;
2480 }
2481 else if (c >= '0' && c <= '4')
2482 {
2483 /* ESC <Fp> for start/end composition. */
2484 found |= CATEGORY_MASK_ISO;
2485 break;
2486 }
2487 else
2488 {
2489 /* Invalid escape sequence. Just ignore it. */
2490 break;
2491 }
2492
2493 /* We found a valid designation sequence for CHARSET. */
2494 rejected |= CATEGORY_MASK_ISO_8BIT;
2495 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7],
2496 id))
2497 found |= CATEGORY_MASK_ISO_7;
2498 else
2499 rejected |= CATEGORY_MASK_ISO_7;
2500 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_tight],
2501 id))
2502 found |= CATEGORY_MASK_ISO_7_TIGHT;
2503 else
2504 rejected |= CATEGORY_MASK_ISO_7_TIGHT;
2505 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_7_else],
2506 id))
2507 found |= CATEGORY_MASK_ISO_7_ELSE;
2508 else
2509 rejected |= CATEGORY_MASK_ISO_7_ELSE;
2510 if (SAFE_CHARSET_P (&coding_categories[coding_category_iso_8_else],
2511 id))
2512 found |= CATEGORY_MASK_ISO_8_ELSE;
2513 else
2514 rejected |= CATEGORY_MASK_ISO_8_ELSE;
2515 break;
2516
2517 case ISO_CODE_SO:
2518 case ISO_CODE_SI:
2519 /* Locking shift out/in. */
2520 if (inhibit_iso_escape_detection)
2521 break;
2522 single_shifting = 0;
2523 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_8BIT;
2524 found |= CATEGORY_MASK_ISO_ELSE;
2525 break;
2526
2527 case ISO_CODE_CSI:
2528 /* Control sequence introducer. */
2529 single_shifting = 0;
2530 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2531 found |= CATEGORY_MASK_ISO_8_ELSE;
2532 goto check_extra_latin;
2533
2534
2535 case ISO_CODE_SS2:
2536 case ISO_CODE_SS3:
2537 /* Single shift. */
2538 if (inhibit_iso_escape_detection)
2539 break;
2540 single_shifting = 1;
2541 rejected |= CATEGORY_MASK_ISO_7BIT;
2542 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2543 & CODING_ISO_FLAG_SINGLE_SHIFT)
2544 found |= CATEGORY_MASK_ISO_8_1;
2545 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2546 & CODING_ISO_FLAG_SINGLE_SHIFT)
2547 found |= CATEGORY_MASK_ISO_8_2;
2548 goto check_extra_latin;
2549
2550 default:
2551 if (c < 0x80)
2552 {
2553 single_shifting = 0;
2554 break;
2555 }
2556 if (c >= 0xA0)
2557 {
2558 rejected |= CATEGORY_MASK_ISO_7BIT | CATEGORY_MASK_ISO_7_ELSE;
2559 found |= CATEGORY_MASK_ISO_8_1;
2560 /* Check the length of succeeding codes of the range
2561 0xA0..0FF. If the byte length is even, we include
2562 CATEGORY_MASK_ISO_8_2 in `found'. We can check this
2563 only when we are not single shifting. */
2564 if (! single_shifting
2565 && ! (rejected & CATEGORY_MASK_ISO_8_2))
2566 {
2567 int i = 1;
2568 while (src < src_end)
2569 {
2570 ONE_MORE_BYTE (c);
2571 if (c < 0xA0)
2572 break;
2573 i++;
2574 }
2575
2576 if (i & 1 && src < src_end)
2577 rejected |= CATEGORY_MASK_ISO_8_2;
2578 else
2579 found |= CATEGORY_MASK_ISO_8_2;
2580 }
2581 break;
2582 }
2583 check_extra_latin:
2584 single_shifting = 0;
2585 if (! VECTORP (Vlatin_extra_code_table)
2586 || NILP (XVECTOR (Vlatin_extra_code_table)->contents[c]))
2587 {
2588 rejected = CATEGORY_MASK_ISO;
2589 break;
2590 }
2591 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_1])
2592 & CODING_ISO_FLAG_LATIN_EXTRA)
2593 found |= CATEGORY_MASK_ISO_8_1;
2594 else
2595 rejected |= CATEGORY_MASK_ISO_8_1;
2596 if (CODING_ISO_FLAGS (&coding_categories[coding_category_iso_8_2])
2597 & CODING_ISO_FLAG_LATIN_EXTRA)
2598 found |= CATEGORY_MASK_ISO_8_2;
2599 else
2600 rejected |= CATEGORY_MASK_ISO_8_2;
2601 }
2602 }
2603 detect_info->rejected |= CATEGORY_MASK_ISO;
2604 return 0;
2605
2606 no_more_source:
2607 detect_info->rejected |= rejected;
2608 detect_info->found |= (found & ~rejected);
2609 return 1;
2610 }
2611
2612
2613 /* Set designation state into CODING. */
2614 #define DECODE_DESIGNATION(reg, dim, chars_96, final) \
2615 do { \
2616 int id, prev; \
2617 \
2618 if (final < '0' || final >= 128 \
2619 || ((id = ISO_CHARSET_TABLE (dim, chars_96, final)) < 0) \
2620 || !SAFE_CHARSET_P (coding, id)) \
2621 { \
2622 CODING_ISO_DESIGNATION (coding, reg) = -2; \
2623 goto invalid_code; \
2624 } \
2625 prev = CODING_ISO_DESIGNATION (coding, reg); \
2626 if (id == charset_jisx0201_roman) \
2627 { \
2628 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
2629 id = charset_ascii; \
2630 } \
2631 else if (id == charset_jisx0208_1978) \
2632 { \
2633 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
2634 id = charset_jisx0208; \
2635 } \
2636 CODING_ISO_DESIGNATION (coding, reg) = id; \
2637 /* If there was an invalid designation to REG previously, and this \
2638 designation is ASCII to REG, we should keep this designation \
2639 sequence. */ \
2640 if (prev == -2 && id == charset_ascii) \
2641 goto invalid_code; \
2642 } while (0)
2643
2644
2645 #define MAYBE_FINISH_COMPOSITION() \
2646 do { \
2647 int i; \
2648 if (composition_state == COMPOSING_NO) \
2649 break; \
2650 /* It is assured that we have enough room for producing \
2651 characters stored in the table `components'. */ \
2652 if (charbuf + component_idx > charbuf_end) \
2653 goto no_more_source; \
2654 composition_state = COMPOSING_NO; \
2655 if (method == COMPOSITION_RELATIVE \
2656 || method == COMPOSITION_WITH_ALTCHARS) \
2657 { \
2658 for (i = 0; i < component_idx; i++) \
2659 *charbuf++ = components[i]; \
2660 char_offset += component_idx; \
2661 } \
2662 else \
2663 { \
2664 for (i = 0; i < component_idx; i += 2) \
2665 *charbuf++ = components[i]; \
2666 char_offset += (component_idx / 2) + 1; \
2667 } \
2668 } while (0)
2669
2670
2671 /* Handle composition start sequence ESC 0, ESC 2, ESC 3, or ESC 4.
2672 ESC 0 : relative composition : ESC 0 CHAR ... ESC 1
2673 ESC 2 : rulebase composition : ESC 2 CHAR RULE CHAR RULE ... CHAR ESC 1
2674 ESC 3 : altchar composition : ESC 3 CHAR ... ESC 0 CHAR ... ESC 1
2675 ESC 4 : alt&rule composition : ESC 4 CHAR RULE ... CHAR ESC 0 CHAR ... ESC 1
2676 */
2677
2678 #define DECODE_COMPOSITION_START(c1) \
2679 do { \
2680 if (c1 == '0' \
2681 && composition_state == COMPOSING_COMPONENT_RULE) \
2682 { \
2683 component_len = component_idx; \
2684 composition_state = COMPOSING_CHAR; \
2685 } \
2686 else \
2687 { \
2688 unsigned char *p; \
2689 \
2690 MAYBE_FINISH_COMPOSITION (); \
2691 if (charbuf + MAX_COMPOSITION_COMPONENTS > charbuf_end) \
2692 goto no_more_source; \
2693 for (p = src; p < src_end - 1; p++) \
2694 if (*p == ISO_CODE_ESC && p[1] == '1') \
2695 break; \
2696 if (p == src_end - 1) \
2697 { \
2698 if (coding->mode & CODING_MODE_LAST_BLOCK) \
2699 goto invalid_code; \
2700 goto no_more_source; \
2701 } \
2702 \
2703 /* This is surely the start of a composition. */ \
2704 method = (c1 == '0' ? COMPOSITION_RELATIVE \
2705 : c1 == '2' ? COMPOSITION_WITH_RULE \
2706 : c1 == '3' ? COMPOSITION_WITH_ALTCHARS \
2707 : COMPOSITION_WITH_RULE_ALTCHARS); \
2708 composition_state = (c1 <= '2' ? COMPOSING_CHAR \
2709 : COMPOSING_COMPONENT_CHAR); \
2710 component_idx = component_len = 0; \
2711 } \
2712 } while (0)
2713
2714
2715 /* Handle compositoin end sequence ESC 1. */
2716
2717 #define DECODE_COMPOSITION_END() \
2718 do { \
2719 int nchars = (component_len > 0 ? component_idx - component_len \
2720 : method == COMPOSITION_RELATIVE ? component_idx \
2721 : (component_idx + 1) / 2); \
2722 int i; \
2723 int *saved_charbuf = charbuf; \
2724 int from = coding->produced_char + char_offset; \
2725 int to = from + nchars; \
2726 \
2727 ADD_COMPOSITION_DATA (charbuf, from, to, method); \
2728 if (method != COMPOSITION_RELATIVE) \
2729 { \
2730 if (component_len == 0) \
2731 for (i = 0; i < component_idx; i++) \
2732 *charbuf++ = components[i]; \
2733 else \
2734 for (i = 0; i < component_len; i++) \
2735 *charbuf++ = components[i]; \
2736 *saved_charbuf = saved_charbuf - charbuf; \
2737 } \
2738 if (method == COMPOSITION_WITH_RULE) \
2739 for (i = 0; i < component_idx; i += 2, char_offset++) \
2740 *charbuf++ = components[i]; \
2741 else \
2742 for (i = component_len; i < component_idx; i++, char_offset++) \
2743 *charbuf++ = components[i]; \
2744 coding->annotated = 1; \
2745 composition_state = COMPOSING_NO; \
2746 } while (0)
2747
2748
2749 /* Decode a composition rule from the byte C1 (and maybe one more byte
2750 from SRC) and store one encoded composition rule in
2751 coding->cmp_data. */
2752
2753 #define DECODE_COMPOSITION_RULE(c1) \
2754 do { \
2755 (c1) -= 32; \
2756 if (c1 < 81) /* old format (before ver.21) */ \
2757 { \
2758 int gref = (c1) / 9; \
2759 int nref = (c1) % 9; \
2760 if (gref == 4) gref = 10; \
2761 if (nref == 4) nref = 10; \
2762 c1 = COMPOSITION_ENCODE_RULE (gref, nref); \
2763 } \
2764 else if (c1 < 93) /* new format (after ver.21) */ \
2765 { \
2766 ONE_MORE_BYTE (c2); \
2767 c1 = COMPOSITION_ENCODE_RULE (c1 - 81, c2 - 32); \
2768 } \
2769 else \
2770 c1 = 0; \
2771 } while (0)
2772
2773
2774 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
2775
2776 static void
2777 decode_coding_iso_2022 (coding)
2778 struct coding_system *coding;
2779 {
2780 unsigned char *src = coding->source + coding->consumed;
2781 unsigned char *src_end = coding->source + coding->src_bytes;
2782 unsigned char *src_base;
2783 int *charbuf = coding->charbuf;
2784 int *charbuf_end
2785 = charbuf + coding->charbuf_size - 4 - MAX_ANNOTATION_LENGTH;
2786 int consumed_chars = 0, consumed_chars_base;
2787 int multibytep = coding->src_multibyte;
2788 /* Charsets invoked to graphic plane 0 and 1 respectively. */
2789 int charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2790 int charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2791 struct charset *charset;
2792 int c;
2793 /* For handling composition sequence. */
2794 #define COMPOSING_NO 0
2795 #define COMPOSING_CHAR 1
2796 #define COMPOSING_RULE 2
2797 #define COMPOSING_COMPONENT_CHAR 3
2798 #define COMPOSING_COMPONENT_RULE 4
2799
2800 int composition_state = COMPOSING_NO;
2801 enum composition_method method;
2802 int components[MAX_COMPOSITION_COMPONENTS * 2 + 1];
2803 int component_idx;
2804 int component_len;
2805 Lisp_Object attrs, eol_type, charset_list;
2806 int char_offset = coding->produced_char;
2807 int last_offset = char_offset;
2808 int last_id = charset_ascii;
2809
2810 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
2811 setup_iso_safe_charsets (attrs);
2812
2813 while (1)
2814 {
2815 int c1, c2;
2816
2817 src_base = src;
2818 consumed_chars_base = consumed_chars;
2819
2820 if (charbuf >= charbuf_end)
2821 break;
2822
2823 ONE_MORE_BYTE (c1);
2824
2825 /* We produce at most one character. */
2826 switch (iso_code_class [c1])
2827 {
2828 case ISO_0x20_or_0x7F:
2829 if (composition_state != COMPOSING_NO)
2830 {
2831 if (composition_state == COMPOSING_RULE
2832 || composition_state == COMPOSING_COMPONENT_RULE)
2833 {
2834 DECODE_COMPOSITION_RULE (c1);
2835 components[component_idx++] = c1;
2836 composition_state--;
2837 continue;
2838 }
2839 }
2840 if (charset_id_0 < 0
2841 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_0)))
2842 /* This is SPACE or DEL. */
2843 charset = CHARSET_FROM_ID (charset_ascii);
2844 else
2845 charset = CHARSET_FROM_ID (charset_id_0);
2846 break;
2847
2848 case ISO_graphic_plane_0:
2849 if (composition_state != COMPOSING_NO)
2850 {
2851 if (composition_state == COMPOSING_RULE
2852 || composition_state == COMPOSING_COMPONENT_RULE)
2853 {
2854 DECODE_COMPOSITION_RULE (c1);
2855 components[component_idx++] = c1;
2856 composition_state--;
2857 continue;
2858 }
2859 }
2860 charset = CHARSET_FROM_ID (charset_id_0);
2861 break;
2862
2863 case ISO_0xA0_or_0xFF:
2864 if (charset_id_1 < 0
2865 || ! CHARSET_ISO_CHARS_96 (CHARSET_FROM_ID (charset_id_1))
2866 || CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS)
2867 goto invalid_code;
2868 /* This is a graphic character, we fall down ... */
2869
2870 case ISO_graphic_plane_1:
2871 if (charset_id_1 < 0)
2872 goto invalid_code;
2873 charset = CHARSET_FROM_ID (charset_id_1);
2874 break;
2875
2876 case ISO_carriage_return:
2877 if (c1 == '\r')
2878 {
2879 if (EQ (eol_type, Qdos))
2880 {
2881 if (src == src_end)
2882 {
2883 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
2884 goto no_more_source;
2885 }
2886 if (*src == '\n')
2887 ONE_MORE_BYTE (c1);
2888 }
2889 else if (EQ (eol_type, Qmac))
2890 c1 = '\n';
2891 }
2892 /* fall through */
2893
2894 case ISO_control_0:
2895 MAYBE_FINISH_COMPOSITION ();
2896 charset = CHARSET_FROM_ID (charset_ascii);
2897 break;
2898
2899 case ISO_control_1:
2900 MAYBE_FINISH_COMPOSITION ();
2901 goto invalid_code;
2902
2903 case ISO_shift_out:
2904 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2905 || CODING_ISO_DESIGNATION (coding, 1) < 0)
2906 goto invalid_code;
2907 CODING_ISO_INVOCATION (coding, 0) = 1;
2908 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2909 continue;
2910
2911 case ISO_shift_in:
2912 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT))
2913 goto invalid_code;
2914 CODING_ISO_INVOCATION (coding, 0) = 0;
2915 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2916 continue;
2917
2918 case ISO_single_shift_2_7:
2919 case ISO_single_shift_2:
2920 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2921 goto invalid_code;
2922 /* SS2 is handled as an escape sequence of ESC 'N' */
2923 c1 = 'N';
2924 goto label_escape_sequence;
2925
2926 case ISO_single_shift_3:
2927 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT))
2928 goto invalid_code;
2929 /* SS2 is handled as an escape sequence of ESC 'O' */
2930 c1 = 'O';
2931 goto label_escape_sequence;
2932
2933 case ISO_control_sequence_introducer:
2934 /* CSI is handled as an escape sequence of ESC '[' ... */
2935 c1 = '[';
2936 goto label_escape_sequence;
2937
2938 case ISO_escape:
2939 ONE_MORE_BYTE (c1);
2940 label_escape_sequence:
2941 /* Escape sequences handled here are invocation,
2942 designation, direction specification, and character
2943 composition specification. */
2944 switch (c1)
2945 {
2946 case '&': /* revision of following character set */
2947 ONE_MORE_BYTE (c1);
2948 if (!(c1 >= '@' && c1 <= '~'))
2949 goto invalid_code;
2950 ONE_MORE_BYTE (c1);
2951 if (c1 != ISO_CODE_ESC)
2952 goto invalid_code;
2953 ONE_MORE_BYTE (c1);
2954 goto label_escape_sequence;
2955
2956 case '$': /* designation of 2-byte character set */
2957 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
2958 goto invalid_code;
2959 ONE_MORE_BYTE (c1);
2960 if (c1 >= '@' && c1 <= 'B')
2961 { /* designation of JISX0208.1978, GB2312.1980,
2962 or JISX0208.1980 */
2963 DECODE_DESIGNATION (0, 2, 0, c1);
2964 }
2965 else if (c1 >= 0x28 && c1 <= 0x2B)
2966 { /* designation of DIMENSION2_CHARS94 character set */
2967 ONE_MORE_BYTE (c2);
2968 DECODE_DESIGNATION (c1 - 0x28, 2, 0, c2);
2969 }
2970 else if (c1 >= 0x2C && c1 <= 0x2F)
2971 { /* designation of DIMENSION2_CHARS96 character set */
2972 ONE_MORE_BYTE (c2);
2973 DECODE_DESIGNATION (c1 - 0x2C, 2, 1, c2);
2974 }
2975 else
2976 goto invalid_code;
2977 /* We must update these variables now. */
2978 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2979 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
2980 continue;
2981
2982 case 'n': /* invocation of locking-shift-2 */
2983 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2984 || CODING_ISO_DESIGNATION (coding, 2) < 0)
2985 goto invalid_code;
2986 CODING_ISO_INVOCATION (coding, 0) = 2;
2987 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2988 continue;
2989
2990 case 'o': /* invocation of locking-shift-3 */
2991 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LOCKING_SHIFT)
2992 || CODING_ISO_DESIGNATION (coding, 3) < 0)
2993 goto invalid_code;
2994 CODING_ISO_INVOCATION (coding, 0) = 3;
2995 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
2996 continue;
2997
2998 case 'N': /* invocation of single-shift-2 */
2999 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3000 || CODING_ISO_DESIGNATION (coding, 2) < 0)
3001 goto invalid_code;
3002 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 2));
3003 ONE_MORE_BYTE (c1);
3004 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3005 goto invalid_code;
3006 break;
3007
3008 case 'O': /* invocation of single-shift-3 */
3009 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3010 || CODING_ISO_DESIGNATION (coding, 3) < 0)
3011 goto invalid_code;
3012 charset = CHARSET_FROM_ID (CODING_ISO_DESIGNATION (coding, 3));
3013 ONE_MORE_BYTE (c1);
3014 if (c1 < 0x20 || (c1 >= 0x80 && c1 < 0xA0))
3015 goto invalid_code;
3016 break;
3017
3018 case '0': case '2': case '3': case '4': /* start composition */
3019 if (! (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK))
3020 goto invalid_code;
3021 DECODE_COMPOSITION_START (c1);
3022 continue;
3023
3024 case '1': /* end composition */
3025 if (composition_state == COMPOSING_NO)
3026 goto invalid_code;
3027 DECODE_COMPOSITION_END ();
3028 continue;
3029
3030 case '[': /* specification of direction */
3031 if (! CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DIRECTION)
3032 goto invalid_code;
3033 /* For the moment, nested direction is not supported.
3034 So, `coding->mode & CODING_MODE_DIRECTION' zero means
3035 left-to-right, and nozero means right-to-left. */
3036 ONE_MORE_BYTE (c1);
3037 switch (c1)
3038 {
3039 case ']': /* end of the current direction */
3040 coding->mode &= ~CODING_MODE_DIRECTION;
3041
3042 case '0': /* end of the current direction */
3043 case '1': /* start of left-to-right direction */
3044 ONE_MORE_BYTE (c1);
3045 if (c1 == ']')
3046 coding->mode &= ~CODING_MODE_DIRECTION;
3047 else
3048 goto invalid_code;
3049 break;
3050
3051 case '2': /* start of right-to-left direction */
3052 ONE_MORE_BYTE (c1);
3053 if (c1 == ']')
3054 coding->mode |= CODING_MODE_DIRECTION;
3055 else
3056 goto invalid_code;
3057 break;
3058
3059 default:
3060 goto invalid_code;
3061 }
3062 continue;
3063
3064 case '%':
3065 ONE_MORE_BYTE (c1);
3066 if (c1 == '/')
3067 {
3068 /* CTEXT extended segment:
3069 ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES--
3070 We keep these bytes as is for the moment.
3071 They may be decoded by post-read-conversion. */
3072 int dim, M, L;
3073 int size;
3074
3075 ONE_MORE_BYTE (dim);
3076 ONE_MORE_BYTE (M);
3077 ONE_MORE_BYTE (L);
3078 size = ((M - 128) * 128) + (L - 128);
3079 if (charbuf + 8 + size > charbuf_end)
3080 goto break_loop;
3081 *charbuf++ = ISO_CODE_ESC;
3082 *charbuf++ = '%';
3083 *charbuf++ = '/';
3084 *charbuf++ = dim;
3085 *charbuf++ = BYTE8_TO_CHAR (M);
3086 *charbuf++ = BYTE8_TO_CHAR (L);
3087 while (size-- > 0)
3088 {
3089 ONE_MORE_BYTE (c1);
3090 *charbuf++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3091 }
3092 }
3093 else if (c1 == 'G')
3094 {
3095 /* XFree86 extension for embedding UTF-8 in CTEXT:
3096 ESC % G --UTF-8-BYTES-- ESC % @
3097 We keep these bytes as is for the moment.
3098 They may be decoded by post-read-conversion. */
3099 int *p = charbuf;
3100
3101 if (p + 6 > charbuf_end)
3102 goto break_loop;
3103 *p++ = ISO_CODE_ESC;
3104 *p++ = '%';
3105 *p++ = 'G';
3106 while (p < charbuf_end)
3107 {
3108 ONE_MORE_BYTE (c1);
3109 if (c1 == ISO_CODE_ESC
3110 && src + 1 < src_end
3111 && src[0] == '%'
3112 && src[1] == '@')
3113 break;
3114 *p++ = ASCII_BYTE_P (c1) ? c1 : BYTE8_TO_CHAR (c1);
3115 }
3116 if (p + 3 > charbuf_end)
3117 goto break_loop;
3118 *p++ = ISO_CODE_ESC;
3119 *p++ = '%';
3120 *p++ = '@';
3121 charbuf = p;
3122 }
3123 else
3124 goto invalid_code;
3125 continue;
3126 break;
3127
3128 default:
3129 if (! (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATION))
3130 goto invalid_code;
3131 if (c1 >= 0x28 && c1 <= 0x2B)
3132 { /* designation of DIMENSION1_CHARS94 character set */
3133 ONE_MORE_BYTE (c2);
3134 DECODE_DESIGNATION (c1 - 0x28, 1, 0, c2);
3135 }
3136 else if (c1 >= 0x2C && c1 <= 0x2F)
3137 { /* designation of DIMENSION1_CHARS96 character set */
3138 ONE_MORE_BYTE (c2);
3139 DECODE_DESIGNATION (c1 - 0x2C, 1, 1, c2);
3140 }
3141 else
3142 goto invalid_code;
3143 /* We must update these variables now. */
3144 charset_id_0 = CODING_ISO_INVOKED_CHARSET (coding, 0);
3145 charset_id_1 = CODING_ISO_INVOKED_CHARSET (coding, 1);
3146 continue;
3147 }
3148 }
3149
3150 if (charset->id != charset_ascii
3151 && last_id != charset->id)
3152 {
3153 if (last_id != charset_ascii)
3154 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3155 last_id = charset->id;
3156 last_offset = char_offset;
3157 }
3158
3159 /* Now we know CHARSET and 1st position code C1 of a character.
3160 Produce a decoded character while getting 2nd position code
3161 C2 if necessary. */
3162 c1 &= 0x7F;
3163 if (CHARSET_DIMENSION (charset) > 1)
3164 {
3165 ONE_MORE_BYTE (c2);
3166 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3167 /* C2 is not in a valid range. */
3168 goto invalid_code;
3169 c1 = (c1 << 8) | (c2 & 0x7F);
3170 if (CHARSET_DIMENSION (charset) > 2)
3171 {
3172 ONE_MORE_BYTE (c2);
3173 if (c2 < 0x20 || (c2 >= 0x80 && c2 < 0xA0))
3174 /* C2 is not in a valid range. */
3175 goto invalid_code;
3176 c1 = (c1 << 8) | (c2 & 0x7F);
3177 }
3178 }
3179
3180 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c1, c);
3181 if (c < 0)
3182 {
3183 MAYBE_FINISH_COMPOSITION ();
3184 for (; src_base < src; src_base++, char_offset++)
3185 {
3186 if (ASCII_BYTE_P (*src_base))
3187 *charbuf++ = *src_base;
3188 else
3189 *charbuf++ = BYTE8_TO_CHAR (*src_base);
3190 }
3191 }
3192 else if (composition_state == COMPOSING_NO)
3193 {
3194 *charbuf++ = c;
3195 char_offset++;
3196 }
3197 else
3198 {
3199 components[component_idx++] = c;
3200 if (method == COMPOSITION_WITH_RULE
3201 || (method == COMPOSITION_WITH_RULE_ALTCHARS
3202 && composition_state == COMPOSING_COMPONENT_CHAR))
3203 composition_state++;
3204 }
3205 continue;
3206
3207 invalid_code:
3208 MAYBE_FINISH_COMPOSITION ();
3209 src = src_base;
3210 consumed_chars = consumed_chars_base;
3211 ONE_MORE_BYTE (c);
3212 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
3213 char_offset++;
3214 coding->errors++;
3215 continue;
3216
3217 break_loop:
3218 break;
3219 }
3220
3221 no_more_source:
3222 if (last_id != charset_ascii)
3223 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
3224 coding->consumed_char += consumed_chars_base;
3225 coding->consumed = src_base - coding->source;
3226 coding->charbuf_used = charbuf - coding->charbuf;
3227 }
3228
3229
3230 /* ISO2022 encoding stuff. */
3231
3232 /*
3233 It is not enough to say just "ISO2022" on encoding, we have to
3234 specify more details. In Emacs, each coding system of ISO2022
3235 variant has the following specifications:
3236 1. Initial designation to G0 thru G3.
3237 2. Allows short-form designation?
3238 3. ASCII should be designated to G0 before control characters?
3239 4. ASCII should be designated to G0 at end of line?
3240 5. 7-bit environment or 8-bit environment?
3241 6. Use locking-shift?
3242 7. Use Single-shift?
3243 And the following two are only for Japanese:
3244 8. Use ASCII in place of JIS0201-1976-Roman?
3245 9. Use JISX0208-1983 in place of JISX0208-1978?
3246 These specifications are encoded in CODING_ISO_FLAGS (coding) as flag bits
3247 defined by macros CODING_ISO_FLAG_XXX. See `coding.h' for more
3248 details.
3249 */
3250
3251 /* Produce codes (escape sequence) for designating CHARSET to graphic
3252 register REG at DST, and increment DST. If <final-char> of CHARSET is
3253 '@', 'A', or 'B' and the coding system CODING allows, produce
3254 designation sequence of short-form. */
3255
3256 #define ENCODE_DESIGNATION(charset, reg, coding) \
3257 do { \
3258 unsigned char final_char = CHARSET_ISO_FINAL (charset); \
3259 char *intermediate_char_94 = "()*+"; \
3260 char *intermediate_char_96 = ",-./"; \
3261 int revision = -1; \
3262 int c; \
3263 \
3264 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_REVISION) \
3265 revision = CHARSET_ISO_REVISION (charset); \
3266 \
3267 if (revision >= 0) \
3268 { \
3269 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '&'); \
3270 EMIT_ONE_BYTE ('@' + revision); \
3271 } \
3272 EMIT_ONE_ASCII_BYTE (ISO_CODE_ESC); \
3273 if (CHARSET_DIMENSION (charset) == 1) \
3274 { \
3275 if (! CHARSET_ISO_CHARS_96 (charset)) \
3276 c = intermediate_char_94[reg]; \
3277 else \
3278 c = intermediate_char_96[reg]; \
3279 EMIT_ONE_ASCII_BYTE (c); \
3280 } \
3281 else \
3282 { \
3283 EMIT_ONE_ASCII_BYTE ('$'); \
3284 if (! CHARSET_ISO_CHARS_96 (charset)) \
3285 { \
3286 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_LONG_FORM \
3287 || reg != 0 \
3288 || final_char < '@' || final_char > 'B') \
3289 EMIT_ONE_ASCII_BYTE (intermediate_char_94[reg]); \
3290 } \
3291 else \
3292 EMIT_ONE_ASCII_BYTE (intermediate_char_96[reg]); \
3293 } \
3294 EMIT_ONE_ASCII_BYTE (final_char); \
3295 \
3296 CODING_ISO_DESIGNATION (coding, reg) = CHARSET_ID (charset); \
3297 } while (0)
3298
3299
3300 /* The following two macros produce codes (control character or escape
3301 sequence) for ISO2022 single-shift functions (single-shift-2 and
3302 single-shift-3). */
3303
3304 #define ENCODE_SINGLE_SHIFT_2 \
3305 do { \
3306 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3307 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'N'); \
3308 else \
3309 EMIT_ONE_BYTE (ISO_CODE_SS2); \
3310 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3311 } while (0)
3312
3313
3314 #define ENCODE_SINGLE_SHIFT_3 \
3315 do { \
3316 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3317 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'O'); \
3318 else \
3319 EMIT_ONE_BYTE (ISO_CODE_SS3); \
3320 CODING_ISO_SINGLE_SHIFTING (coding) = 1; \
3321 } while (0)
3322
3323
3324 /* The following four macros produce codes (control character or
3325 escape sequence) for ISO2022 locking-shift functions (shift-in,
3326 shift-out, locking-shift-2, and locking-shift-3). */
3327
3328 #define ENCODE_SHIFT_IN \
3329 do { \
3330 EMIT_ONE_ASCII_BYTE (ISO_CODE_SI); \
3331 CODING_ISO_INVOCATION (coding, 0) = 0; \
3332 } while (0)
3333
3334
3335 #define ENCODE_SHIFT_OUT \
3336 do { \
3337 EMIT_ONE_ASCII_BYTE (ISO_CODE_SO); \
3338 CODING_ISO_INVOCATION (coding, 0) = 1; \
3339 } while (0)
3340
3341
3342 #define ENCODE_LOCKING_SHIFT_2 \
3343 do { \
3344 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3345 CODING_ISO_INVOCATION (coding, 0) = 2; \
3346 } while (0)
3347
3348
3349 #define ENCODE_LOCKING_SHIFT_3 \
3350 do { \
3351 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, 'n'); \
3352 CODING_ISO_INVOCATION (coding, 0) = 3; \
3353 } while (0)
3354
3355
3356 /* Produce codes for a DIMENSION1 character whose character set is
3357 CHARSET and whose position-code is C1. Designation and invocation
3358 sequences are also produced in advance if necessary. */
3359
3360 #define ENCODE_ISO_CHARACTER_DIMENSION1(charset, c1) \
3361 do { \
3362 int id = CHARSET_ID (charset); \
3363 \
3364 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_ROMAN) \
3365 && id == charset_ascii) \
3366 { \
3367 id = charset_jisx0201_roman; \
3368 charset = CHARSET_FROM_ID (id); \
3369 } \
3370 \
3371 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3372 { \
3373 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3374 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3375 else \
3376 EMIT_ONE_BYTE (c1 | 0x80); \
3377 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3378 break; \
3379 } \
3380 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3381 { \
3382 EMIT_ONE_ASCII_BYTE (c1 & 0x7F); \
3383 break; \
3384 } \
3385 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3386 { \
3387 EMIT_ONE_BYTE (c1 | 0x80); \
3388 break; \
3389 } \
3390 else \
3391 /* Since CHARSET is not yet invoked to any graphic planes, we \
3392 must invoke it, or, at first, designate it to some graphic \
3393 register. Then repeat the loop to actually produce the \
3394 character. */ \
3395 dst = encode_invocation_designation (charset, coding, dst, \
3396 &produced_chars); \
3397 } while (1)
3398
3399
3400 /* Produce codes for a DIMENSION2 character whose character set is
3401 CHARSET and whose position-codes are C1 and C2. Designation and
3402 invocation codes are also produced in advance if necessary. */
3403
3404 #define ENCODE_ISO_CHARACTER_DIMENSION2(charset, c1, c2) \
3405 do { \
3406 int id = CHARSET_ID (charset); \
3407 \
3408 if ((CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_USE_OLDJIS) \
3409 && id == charset_jisx0208) \
3410 { \
3411 id = charset_jisx0208_1978; \
3412 charset = CHARSET_FROM_ID (id); \
3413 } \
3414 \
3415 if (CODING_ISO_SINGLE_SHIFTING (coding)) \
3416 { \
3417 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SEVEN_BITS) \
3418 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3419 else \
3420 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3421 CODING_ISO_SINGLE_SHIFTING (coding) = 0; \
3422 break; \
3423 } \
3424 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 0)) \
3425 { \
3426 EMIT_TWO_ASCII_BYTES ((c1) & 0x7F, (c2) & 0x7F); \
3427 break; \
3428 } \
3429 else if (id == CODING_ISO_INVOKED_CHARSET (coding, 1)) \
3430 { \
3431 EMIT_TWO_BYTES ((c1) | 0x80, (c2) | 0x80); \
3432 break; \
3433 } \
3434 else \
3435 /* Since CHARSET is not yet invoked to any graphic planes, we \
3436 must invoke it, or, at first, designate it to some graphic \
3437 register. Then repeat the loop to actually produce the \
3438 character. */ \
3439 dst = encode_invocation_designation (charset, coding, dst, \
3440 &produced_chars); \
3441 } while (1)
3442
3443
3444 #define ENCODE_ISO_CHARACTER(charset, c) \
3445 do { \
3446 int code = ENCODE_CHAR ((charset),(c)); \
3447 \
3448 if (CHARSET_DIMENSION (charset) == 1) \
3449 ENCODE_ISO_CHARACTER_DIMENSION1 ((charset), code); \
3450 else \
3451 ENCODE_ISO_CHARACTER_DIMENSION2 ((charset), code >> 8, code & 0xFF); \
3452 } while (0)
3453
3454
3455 /* Produce designation and invocation codes at a place pointed by DST
3456 to use CHARSET. The element `spec.iso_2022' of *CODING is updated.
3457 Return new DST. */
3458
3459 unsigned char *
3460 encode_invocation_designation (charset, coding, dst, p_nchars)
3461 struct charset *charset;
3462 struct coding_system *coding;
3463 unsigned char *dst;
3464 int *p_nchars;
3465 {
3466 int multibytep = coding->dst_multibyte;
3467 int produced_chars = *p_nchars;
3468 int reg; /* graphic register number */
3469 int id = CHARSET_ID (charset);
3470
3471 /* At first, check designations. */
3472 for (reg = 0; reg < 4; reg++)
3473 if (id == CODING_ISO_DESIGNATION (coding, reg))
3474 break;
3475
3476 if (reg >= 4)
3477 {
3478 /* CHARSET is not yet designated to any graphic registers. */
3479 /* At first check the requested designation. */
3480 reg = CODING_ISO_REQUEST (coding, id);
3481 if (reg < 0)
3482 /* Since CHARSET requests no special designation, designate it
3483 to graphic register 0. */
3484 reg = 0;
3485
3486 ENCODE_DESIGNATION (charset, reg, coding);
3487 }
3488
3489 if (CODING_ISO_INVOCATION (coding, 0) != reg
3490 && CODING_ISO_INVOCATION (coding, 1) != reg)
3491 {
3492 /* Since the graphic register REG is not invoked to any graphic
3493 planes, invoke it to graphic plane 0. */
3494 switch (reg)
3495 {
3496 case 0: /* graphic register 0 */
3497 ENCODE_SHIFT_IN;
3498 break;
3499
3500 case 1: /* graphic register 1 */
3501 ENCODE_SHIFT_OUT;
3502 break;
3503
3504 case 2: /* graphic register 2 */
3505 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3506 ENCODE_SINGLE_SHIFT_2;
3507 else
3508 ENCODE_LOCKING_SHIFT_2;
3509 break;
3510
3511 case 3: /* graphic register 3 */
3512 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_SINGLE_SHIFT)
3513 ENCODE_SINGLE_SHIFT_3;
3514 else
3515 ENCODE_LOCKING_SHIFT_3;
3516 break;
3517 }
3518 }
3519
3520 *p_nchars = produced_chars;
3521 return dst;
3522 }
3523
3524 /* The following three macros produce codes for indicating direction
3525 of text. */
3526 #define ENCODE_CONTROL_SEQUENCE_INTRODUCER \
3527 do { \
3528 if (CODING_ISO_FLAGS (coding) == CODING_ISO_FLAG_SEVEN_BITS) \
3529 EMIT_TWO_ASCII_BYTES (ISO_CODE_ESC, '['); \
3530 else \
3531 EMIT_ONE_BYTE (ISO_CODE_CSI); \
3532 } while (0)
3533
3534
3535 #define ENCODE_DIRECTION_R2L() \
3536 do { \
3537 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3538 EMIT_TWO_ASCII_BYTES ('2', ']'); \
3539 } while (0)
3540
3541
3542 #define ENCODE_DIRECTION_L2R() \
3543 do { \
3544 ENCODE_CONTROL_SEQUENCE_INTRODUCER (dst); \
3545 EMIT_TWO_ASCII_BYTES ('0', ']'); \
3546 } while (0)
3547
3548
3549 /* Produce codes for designation and invocation to reset the graphic
3550 planes and registers to initial state. */
3551 #define ENCODE_RESET_PLANE_AND_REGISTER() \
3552 do { \
3553 int reg; \
3554 struct charset *charset; \
3555 \
3556 if (CODING_ISO_INVOCATION (coding, 0) != 0) \
3557 ENCODE_SHIFT_IN; \
3558 for (reg = 0; reg < 4; reg++) \
3559 if (CODING_ISO_INITIAL (coding, reg) >= 0 \
3560 && (CODING_ISO_DESIGNATION (coding, reg) \
3561 != CODING_ISO_INITIAL (coding, reg))) \
3562 { \
3563 charset = CHARSET_FROM_ID (CODING_ISO_INITIAL (coding, reg)); \
3564 ENCODE_DESIGNATION (charset, reg, coding); \
3565 } \
3566 } while (0)
3567
3568
3569 /* Produce designation sequences of charsets in the line started from
3570 SRC to a place pointed by DST, and return updated DST.
3571
3572 If the current block ends before any end-of-line, we may fail to
3573 find all the necessary designations. */
3574
3575 static unsigned char *
3576 encode_designation_at_bol (coding, charbuf, charbuf_end, dst)
3577 struct coding_system *coding;
3578 int *charbuf, *charbuf_end;
3579 unsigned char *dst;
3580 {
3581 struct charset *charset;
3582 /* Table of charsets to be designated to each graphic register. */
3583 int r[4];
3584 int c, found = 0, reg;
3585 int produced_chars = 0;
3586 int multibytep = coding->dst_multibyte;
3587 Lisp_Object attrs;
3588 Lisp_Object charset_list;
3589
3590 attrs = CODING_ID_ATTRS (coding->id);
3591 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
3592 if (EQ (charset_list, Qiso_2022))
3593 charset_list = Viso_2022_charset_list;
3594
3595 for (reg = 0; reg < 4; reg++)
3596 r[reg] = -1;
3597
3598 while (found < 4)
3599 {
3600 int id;
3601
3602 c = *charbuf++;
3603 if (c == '\n')
3604 break;
3605 charset = char_charset (c, charset_list, NULL);
3606 id = CHARSET_ID (charset);
3607 reg = CODING_ISO_REQUEST (coding, id);
3608 if (reg >= 0 && r[reg] < 0)
3609 {
3610 found++;
3611 r[reg] = id;
3612 }
3613 }
3614
3615 if (found)
3616 {
3617 for (reg = 0; reg < 4; reg++)
3618 if (r[reg] >= 0
3619 && CODING_ISO_DESIGNATION (coding, reg) != r[reg])
3620 ENCODE_DESIGNATION (CHARSET_FROM_ID (r[reg]), reg, coding);
3621 }
3622
3623 return dst;
3624 }
3625
3626 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions". */
3627
3628 static int
3629 encode_coding_iso_2022 (coding)
3630 struct coding_system *coding;
3631 {
3632 int multibytep = coding->dst_multibyte;
3633 int *charbuf = coding->charbuf;
3634 int *charbuf_end = charbuf + coding->charbuf_used;
3635 unsigned char *dst = coding->destination + coding->produced;
3636 unsigned char *dst_end = coding->destination + coding->dst_bytes;
3637 int safe_room = 16;
3638 int bol_designation
3639 = (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL
3640 && CODING_ISO_BOL (coding));
3641 int produced_chars = 0;
3642 Lisp_Object attrs, eol_type, charset_list;
3643 int ascii_compatible;
3644 int c;
3645 int preferred_charset_id = -1;
3646
3647 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3648 setup_iso_safe_charsets (attrs);
3649 /* Charset list may have been changed. */
3650 charset_list = CODING_ATTR_CHARSET_LIST (attrs); \
3651 coding->safe_charsets
3652 = (char *) XSTRING (CODING_ATTR_SAFE_CHARSETS(attrs))->data;
3653
3654 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
3655
3656 while (charbuf < charbuf_end)
3657 {
3658 ASSURE_DESTINATION (safe_room);
3659
3660 if (bol_designation)
3661 {
3662 unsigned char *dst_prev = dst;
3663
3664 /* We have to produce designation sequences if any now. */
3665 dst = encode_designation_at_bol (coding, charbuf, charbuf_end, dst);
3666 bol_designation = 0;
3667 /* We are sure that designation sequences are all ASCII bytes. */
3668 produced_chars += dst - dst_prev;
3669 }
3670
3671 c = *charbuf++;
3672
3673 if (c < 0)
3674 {
3675 /* Handle an annotation. */
3676 switch (*charbuf)
3677 {
3678 case CODING_ANNOTATE_COMPOSITION_MASK:
3679 /* Not yet implemented. */
3680 break;
3681 case CODING_ANNOTATE_CHARSET_MASK:
3682 preferred_charset_id = charbuf[3];
3683 if (preferred_charset_id >= 0
3684 && NILP (Fmemq (make_number (preferred_charset_id),
3685 charset_list)))
3686 preferred_charset_id = -1;
3687 break;
3688 default:
3689 abort ();
3690 }
3691 charbuf += -c - 1;
3692 continue;
3693 }
3694
3695 /* Now encode the character C. */
3696 if (c < 0x20 || c == 0x7F)
3697 {
3698 if (c == '\n'
3699 || (c == '\r' && EQ (eol_type, Qmac)))
3700 {
3701 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3702 ENCODE_RESET_PLANE_AND_REGISTER ();
3703 if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_INIT_AT_BOL)
3704 {
3705 int i;
3706
3707 for (i = 0; i < 4; i++)
3708 CODING_ISO_DESIGNATION (coding, i)
3709 = CODING_ISO_INITIAL (coding, i);
3710 }
3711 bol_designation
3712 = CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_DESIGNATE_AT_BOL;
3713 }
3714 else if (CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_CNTL)
3715 ENCODE_RESET_PLANE_AND_REGISTER ();
3716 EMIT_ONE_ASCII_BYTE (c);
3717 }
3718 else if (ASCII_CHAR_P (c))
3719 {
3720 if (ascii_compatible)
3721 EMIT_ONE_ASCII_BYTE (c);
3722 else
3723 {
3724 struct charset *charset = CHARSET_FROM_ID (charset_ascii);
3725 ENCODE_ISO_CHARACTER (charset, c);
3726 }
3727 }
3728 else if (CHAR_BYTE8_P (c))
3729 {
3730 c = CHAR_TO_BYTE8 (c);
3731 EMIT_ONE_BYTE (c);
3732 }
3733 else
3734 {
3735 struct charset *charset;
3736
3737 if (preferred_charset_id >= 0)
3738 {
3739 charset = CHARSET_FROM_ID (preferred_charset_id);
3740 if (! CHAR_CHARSET_P (c, charset))
3741 charset = char_charset (c, charset_list, NULL);
3742 }
3743 else
3744 charset = char_charset (c, charset_list, NULL);
3745 if (!charset)
3746 {
3747 if (coding->mode & CODING_MODE_SAFE_ENCODING)
3748 {
3749 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
3750 charset = CHARSET_FROM_ID (charset_ascii);
3751 }
3752 else
3753 {
3754 c = coding->default_char;
3755 charset = char_charset (c, charset_list, NULL);
3756 }
3757 }
3758 ENCODE_ISO_CHARACTER (charset, c);
3759 }
3760 }
3761
3762 if (coding->mode & CODING_MODE_LAST_BLOCK
3763 && CODING_ISO_FLAGS (coding) & CODING_ISO_FLAG_RESET_AT_EOL)
3764 {
3765 ASSURE_DESTINATION (safe_room);
3766 ENCODE_RESET_PLANE_AND_REGISTER ();
3767 }
3768 coding->result = CODING_RESULT_SUCCESS;
3769 CODING_ISO_BOL (coding) = bol_designation;
3770 coding->produced_char += produced_chars;
3771 coding->produced = dst - coding->destination;
3772 return 0;
3773 }
3774
3775 \f
3776 /*** 8,9. SJIS and BIG5 handlers ***/
3777
3778 /* Although SJIS and BIG5 are not ISO's coding system, they are used
3779 quite widely. So, for the moment, Emacs supports them in the bare
3780 C code. But, in the future, they may be supported only by CCL. */
3781
3782 /* SJIS is a coding system encoding three character sets: ASCII, right
3783 half of JISX0201-Kana, and JISX0208. An ASCII character is encoded
3784 as is. A character of charset katakana-jisx0201 is encoded by
3785 "position-code + 0x80". A character of charset japanese-jisx0208
3786 is encoded in 2-byte but two position-codes are divided and shifted
3787 so that it fit in the range below.
3788
3789 --- CODE RANGE of SJIS ---
3790 (character set) (range)
3791 ASCII 0x00 .. 0x7F
3792 KATAKANA-JISX0201 0xA0 .. 0xDF
3793 JISX0208 (1st byte) 0x81 .. 0x9F and 0xE0 .. 0xEF
3794 (2nd byte) 0x40 .. 0x7E and 0x80 .. 0xFC
3795 -------------------------------
3796
3797 */
3798
3799 /* BIG5 is a coding system encoding two character sets: ASCII and
3800 Big5. An ASCII character is encoded as is. Big5 is a two-byte
3801 character set and is encoded in two-byte.
3802
3803 --- CODE RANGE of BIG5 ---
3804 (character set) (range)
3805 ASCII 0x00 .. 0x7F
3806 Big5 (1st byte) 0xA1 .. 0xFE
3807 (2nd byte) 0x40 .. 0x7E and 0xA1 .. 0xFE
3808 --------------------------
3809
3810 */
3811
3812 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3813 Check if a text is encoded in SJIS. If it is, return
3814 CATEGORY_MASK_SJIS, else return 0. */
3815
3816 static int
3817 detect_coding_sjis (coding, detect_info)
3818 struct coding_system *coding;
3819 struct coding_detection_info *detect_info;
3820 {
3821 unsigned char *src = coding->source, *src_base = src;
3822 unsigned char *src_end = coding->source + coding->src_bytes;
3823 int multibytep = coding->src_multibyte;
3824 int consumed_chars = 0;
3825 int found = 0;
3826 int c;
3827 int incomplete;
3828
3829 detect_info->checked |= CATEGORY_MASK_SJIS;
3830 /* A coding system of this category is always ASCII compatible. */
3831 src += coding->head_ascii;
3832
3833 while (1)
3834 {
3835 incomplete = 0;
3836 ONE_MORE_BYTE (c);
3837 incomplete = 1;
3838 if (c < 0x80)
3839 continue;
3840 if ((c >= 0x81 && c <= 0x9F) || (c >= 0xE0 && c <= 0xEF))
3841 {
3842 ONE_MORE_BYTE (c);
3843 if (c < 0x40 || c == 0x7F || c > 0xFC)
3844 break;
3845 found = CATEGORY_MASK_SJIS;
3846 }
3847 else if (c >= 0xA0 && c < 0xE0)
3848 found = CATEGORY_MASK_SJIS;
3849 else
3850 break;
3851 }
3852 detect_info->rejected |= CATEGORY_MASK_SJIS;
3853 return 0;
3854
3855 no_more_source:
3856 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3857 {
3858 detect_info->rejected |= CATEGORY_MASK_SJIS;
3859 return 0;
3860 }
3861 detect_info->found |= found;
3862 return 1;
3863 }
3864
3865 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
3866 Check if a text is encoded in BIG5. If it is, return
3867 CATEGORY_MASK_BIG5, else return 0. */
3868
3869 static int
3870 detect_coding_big5 (coding, detect_info)
3871 struct coding_system *coding;
3872 struct coding_detection_info *detect_info;
3873 {
3874 unsigned char *src = coding->source, *src_base = src;
3875 unsigned char *src_end = coding->source + coding->src_bytes;
3876 int multibytep = coding->src_multibyte;
3877 int consumed_chars = 0;
3878 int found = 0;
3879 int c;
3880 int incomplete;
3881
3882 detect_info->checked |= CATEGORY_MASK_BIG5;
3883 /* A coding system of this category is always ASCII compatible. */
3884 src += coding->head_ascii;
3885
3886 while (1)
3887 {
3888 incomplete = 0;
3889 ONE_MORE_BYTE (c);
3890 incomplete = 1;
3891 if (c < 0x80)
3892 continue;
3893 if (c >= 0xA1)
3894 {
3895 ONE_MORE_BYTE (c);
3896 if (c < 0x40 || (c >= 0x7F && c <= 0xA0))
3897 return 0;
3898 found = CATEGORY_MASK_BIG5;
3899 }
3900 else
3901 break;
3902 }
3903 detect_info->rejected |= CATEGORY_MASK_BIG5;
3904 return 0;
3905
3906 no_more_source:
3907 if (incomplete && coding->mode & CODING_MODE_LAST_BLOCK)
3908 {
3909 detect_info->rejected |= CATEGORY_MASK_BIG5;
3910 return 0;
3911 }
3912 detect_info->found |= found;
3913 return 1;
3914 }
3915
3916 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions".
3917 If SJIS_P is 1, decode SJIS text, else decode BIG5 test. */
3918
3919 static void
3920 decode_coding_sjis (coding)
3921 struct coding_system *coding;
3922 {
3923 unsigned char *src = coding->source + coding->consumed;
3924 unsigned char *src_end = coding->source + coding->src_bytes;
3925 unsigned char *src_base;
3926 int *charbuf = coding->charbuf;
3927 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
3928 int consumed_chars = 0, consumed_chars_base;
3929 int multibytep = coding->src_multibyte;
3930 struct charset *charset_roman, *charset_kanji, *charset_kana;
3931 Lisp_Object attrs, eol_type, charset_list, val;
3932 int char_offset = coding->produced_char;
3933 int last_offset = char_offset;
3934 int last_id = charset_ascii;
3935
3936 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
3937
3938 val = charset_list;
3939 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3940 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
3941 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
3942
3943 while (1)
3944 {
3945 int c, c1;
3946
3947 src_base = src;
3948 consumed_chars_base = consumed_chars;
3949
3950 if (charbuf >= charbuf_end)
3951 break;
3952
3953 ONE_MORE_BYTE (c);
3954
3955 if (c == '\r')
3956 {
3957 if (EQ (eol_type, Qdos))
3958 {
3959 if (src == src_end)
3960 {
3961 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
3962 goto no_more_source;
3963 }
3964 if (*src == '\n')
3965 ONE_MORE_BYTE (c);
3966 }
3967 else if (EQ (eol_type, Qmac))
3968 c = '\n';
3969 }
3970 else
3971 {
3972 struct charset *charset;
3973
3974 if (c < 0x80)
3975 charset = charset_roman;
3976 else
3977 {
3978 if (c >= 0xF0)
3979 goto invalid_code;
3980 if (c < 0xA0 || c >= 0xE0)
3981 {
3982 /* SJIS -> JISX0208 */
3983 ONE_MORE_BYTE (c1);
3984 if (c1 < 0x40 || c1 == 0x7F || c1 > 0xFC)
3985 goto invalid_code;
3986 c = (c << 8) | c1;
3987 SJIS_TO_JIS (c);
3988 charset = charset_kanji;
3989 }
3990 else
3991 {
3992 /* SJIS -> JISX0201-Kana */
3993 c &= 0x7F;
3994 charset = charset_kana;
3995 }
3996 }
3997 if (charset->id != charset_ascii
3998 && last_id != charset->id)
3999 {
4000 if (last_id != charset_ascii)
4001 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4002 last_id = charset->id;
4003 last_offset = char_offset;
4004 }
4005 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4006 }
4007 *charbuf++ = c;
4008 char_offset++;
4009 continue;
4010
4011 invalid_code:
4012 src = src_base;
4013 consumed_chars = consumed_chars_base;
4014 ONE_MORE_BYTE (c);
4015 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4016 char_offset++;
4017 coding->errors++;
4018 }
4019
4020 no_more_source:
4021 if (last_id != charset_ascii)
4022 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4023 coding->consumed_char += consumed_chars_base;
4024 coding->consumed = src_base - coding->source;
4025 coding->charbuf_used = charbuf - coding->charbuf;
4026 }
4027
4028 static void
4029 decode_coding_big5 (coding)
4030 struct coding_system *coding;
4031 {
4032 unsigned char *src = coding->source + coding->consumed;
4033 unsigned char *src_end = coding->source + coding->src_bytes;
4034 unsigned char *src_base;
4035 int *charbuf = coding->charbuf;
4036 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4037 int consumed_chars = 0, consumed_chars_base;
4038 int multibytep = coding->src_multibyte;
4039 struct charset *charset_roman, *charset_big5;
4040 Lisp_Object attrs, eol_type, charset_list, val;
4041 int char_offset = coding->produced_char;
4042 int last_offset = char_offset;
4043 int last_id = charset_ascii;
4044
4045 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4046 val = charset_list;
4047 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4048 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4049
4050 while (1)
4051 {
4052 int c, c1;
4053
4054 src_base = src;
4055 consumed_chars_base = consumed_chars;
4056
4057 if (charbuf >= charbuf_end)
4058 break;
4059
4060 ONE_MORE_BYTE (c);
4061
4062 if (c == '\r')
4063 {
4064 if (EQ (eol_type, Qdos))
4065 {
4066 if (src == src_end)
4067 {
4068 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4069 goto no_more_source;
4070 }
4071 if (*src == '\n')
4072 ONE_MORE_BYTE (c);
4073 }
4074 else if (EQ (eol_type, Qmac))
4075 c = '\n';
4076 }
4077 else
4078 {
4079 struct charset *charset;
4080 if (c < 0x80)
4081 charset = charset_roman;
4082 else
4083 {
4084 /* BIG5 -> Big5 */
4085 if (c < 0xA1 || c > 0xFE)
4086 goto invalid_code;
4087 ONE_MORE_BYTE (c1);
4088 if (c1 < 0x40 || (c1 > 0x7E && c1 < 0xA1) || c1 > 0xFE)
4089 goto invalid_code;
4090 c = c << 8 | c1;
4091 charset = charset_big5;
4092 }
4093 if (charset->id != charset_ascii
4094 && last_id != charset->id)
4095 {
4096 if (last_id != charset_ascii)
4097 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4098 last_id = charset->id;
4099 last_offset = char_offset;
4100 }
4101 CODING_DECODE_CHAR (coding, src, src_base, src_end, charset, c, c);
4102 }
4103
4104 *charbuf++ = c;
4105 char_offset++;
4106 continue;
4107
4108 invalid_code:
4109 src = src_base;
4110 consumed_chars = consumed_chars_base;
4111 ONE_MORE_BYTE (c);
4112 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4113 char_offset++;
4114 coding->errors++;
4115 }
4116
4117 no_more_source:
4118 if (last_id != charset_ascii)
4119 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4120 coding->consumed_char += consumed_chars_base;
4121 coding->consumed = src_base - coding->source;
4122 coding->charbuf_used = charbuf - coding->charbuf;
4123 }
4124
4125 /* See the above "GENERAL NOTES on `encode_coding_XXX ()' functions".
4126 This function can encode charsets `ascii', `katakana-jisx0201',
4127 `japanese-jisx0208', `chinese-big5-1', and `chinese-big5-2'. We
4128 are sure that all these charsets are registered as official charset
4129 (i.e. do not have extended leading-codes). Characters of other
4130 charsets are produced without any encoding. If SJIS_P is 1, encode
4131 SJIS text, else encode BIG5 text. */
4132
4133 static int
4134 encode_coding_sjis (coding)
4135 struct coding_system *coding;
4136 {
4137 int multibytep = coding->dst_multibyte;
4138 int *charbuf = coding->charbuf;
4139 int *charbuf_end = charbuf + coding->charbuf_used;
4140 unsigned char *dst = coding->destination + coding->produced;
4141 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4142 int safe_room = 4;
4143 int produced_chars = 0;
4144 Lisp_Object attrs, eol_type, charset_list, val;
4145 int ascii_compatible;
4146 struct charset *charset_roman, *charset_kanji, *charset_kana;
4147 int c;
4148
4149 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4150 val = charset_list;
4151 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4152 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4153 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
4154
4155 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4156
4157 while (charbuf < charbuf_end)
4158 {
4159 ASSURE_DESTINATION (safe_room);
4160 c = *charbuf++;
4161 /* Now encode the character C. */
4162 if (ASCII_CHAR_P (c) && ascii_compatible)
4163 EMIT_ONE_ASCII_BYTE (c);
4164 else if (CHAR_BYTE8_P (c))
4165 {
4166 c = CHAR_TO_BYTE8 (c);
4167 EMIT_ONE_BYTE (c);
4168 }
4169 else
4170 {
4171 unsigned code;
4172 struct charset *charset = char_charset (c, charset_list, &code);
4173
4174 if (!charset)
4175 {
4176 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4177 {
4178 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4179 charset = CHARSET_FROM_ID (charset_ascii);
4180 }
4181 else
4182 {
4183 c = coding->default_char;
4184 charset = char_charset (c, charset_list, &code);
4185 }
4186 }
4187 if (code == CHARSET_INVALID_CODE (charset))
4188 abort ();
4189 if (charset == charset_kanji)
4190 {
4191 int c1, c2;
4192 JIS_TO_SJIS (code);
4193 c1 = code >> 8, c2 = code & 0xFF;
4194 EMIT_TWO_BYTES (c1, c2);
4195 }
4196 else if (charset == charset_kana)
4197 EMIT_ONE_BYTE (code | 0x80);
4198 else
4199 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4200 }
4201 }
4202 coding->result = CODING_RESULT_SUCCESS;
4203 coding->produced_char += produced_chars;
4204 coding->produced = dst - coding->destination;
4205 return 0;
4206 }
4207
4208 static int
4209 encode_coding_big5 (coding)
4210 struct coding_system *coding;
4211 {
4212 int multibytep = coding->dst_multibyte;
4213 int *charbuf = coding->charbuf;
4214 int *charbuf_end = charbuf + coding->charbuf_used;
4215 unsigned char *dst = coding->destination + coding->produced;
4216 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4217 int safe_room = 4;
4218 int produced_chars = 0;
4219 Lisp_Object attrs, eol_type, charset_list, val;
4220 int ascii_compatible;
4221 struct charset *charset_roman, *charset_big5;
4222 int c;
4223
4224 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4225 val = charset_list;
4226 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
4227 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
4228 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4229
4230 while (charbuf < charbuf_end)
4231 {
4232 ASSURE_DESTINATION (safe_room);
4233 c = *charbuf++;
4234 /* Now encode the character C. */
4235 if (ASCII_CHAR_P (c) && ascii_compatible)
4236 EMIT_ONE_ASCII_BYTE (c);
4237 else if (CHAR_BYTE8_P (c))
4238 {
4239 c = CHAR_TO_BYTE8 (c);
4240 EMIT_ONE_BYTE (c);
4241 }
4242 else
4243 {
4244 unsigned code;
4245 struct charset *charset = char_charset (c, charset_list, &code);
4246
4247 if (! charset)
4248 {
4249 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4250 {
4251 code = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4252 charset = CHARSET_FROM_ID (charset_ascii);
4253 }
4254 else
4255 {
4256 c = coding->default_char;
4257 charset = char_charset (c, charset_list, &code);
4258 }
4259 }
4260 if (code == CHARSET_INVALID_CODE (charset))
4261 abort ();
4262 if (charset == charset_big5)
4263 {
4264 int c1, c2;
4265
4266 c1 = code >> 8, c2 = code & 0xFF;
4267 EMIT_TWO_BYTES (c1, c2);
4268 }
4269 else
4270 EMIT_ONE_ASCII_BYTE (code & 0x7F);
4271 }
4272 }
4273 coding->result = CODING_RESULT_SUCCESS;
4274 coding->produced_char += produced_chars;
4275 coding->produced = dst - coding->destination;
4276 return 0;
4277 }
4278
4279 \f
4280 /*** 10. CCL handlers ***/
4281
4282 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4283 Check if a text is encoded in a coding system of which
4284 encoder/decoder are written in CCL program. If it is, return
4285 CATEGORY_MASK_CCL, else return 0. */
4286
4287 static int
4288 detect_coding_ccl (coding, detect_info)
4289 struct coding_system *coding;
4290 struct coding_detection_info *detect_info;
4291 {
4292 unsigned char *src = coding->source, *src_base = src;
4293 unsigned char *src_end = coding->source + coding->src_bytes;
4294 int multibytep = coding->src_multibyte;
4295 int consumed_chars = 0;
4296 int found = 0;
4297 unsigned char *valids = CODING_CCL_VALIDS (coding);
4298 int head_ascii = coding->head_ascii;
4299 Lisp_Object attrs;
4300
4301 detect_info->checked |= CATEGORY_MASK_CCL;
4302
4303 coding = &coding_categories[coding_category_ccl];
4304 attrs = CODING_ID_ATTRS (coding->id);
4305 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4306 src += head_ascii;
4307
4308 while (1)
4309 {
4310 int c;
4311 ONE_MORE_BYTE (c);
4312 if (! valids[c])
4313 break;
4314 if ((valids[c] > 1))
4315 found = CATEGORY_MASK_CCL;
4316 }
4317 detect_info->rejected |= CATEGORY_MASK_CCL;
4318 return 0;
4319
4320 no_more_source:
4321 detect_info->found |= found;
4322 return 1;
4323 }
4324
4325 static void
4326 decode_coding_ccl (coding)
4327 struct coding_system *coding;
4328 {
4329 const unsigned char *src = coding->source + coding->consumed;
4330 unsigned char *src_end = coding->source + coding->src_bytes;
4331 int *charbuf = coding->charbuf;
4332 int *charbuf_end = charbuf + coding->charbuf_size;
4333 int consumed_chars = 0;
4334 int multibytep = coding->src_multibyte;
4335 struct ccl_program ccl;
4336 int source_charbuf[1024];
4337 int source_byteidx[1024];
4338 Lisp_Object attrs, eol_type, charset_list;
4339
4340 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4341 setup_ccl_program (&ccl, CODING_CCL_DECODER (coding));
4342
4343 while (src < src_end)
4344 {
4345 const unsigned char *p = src;
4346 int *source, *source_end;
4347 int i = 0;
4348
4349 if (multibytep)
4350 while (i < 1024 && p < src_end)
4351 {
4352 source_byteidx[i] = p - src;
4353 source_charbuf[i++] = STRING_CHAR_ADVANCE (p);
4354 }
4355 else
4356 while (i < 1024 && p < src_end)
4357 source_charbuf[i++] = *p++;
4358
4359 if (p == src_end && coding->mode & CODING_MODE_LAST_BLOCK)
4360 ccl.last_block = 1;
4361
4362 source = source_charbuf;
4363 source_end = source + i;
4364 while (source < source_end)
4365 {
4366 ccl_driver (&ccl, source, charbuf,
4367 source_end - source, charbuf_end - charbuf,
4368 charset_list);
4369 source += ccl.consumed;
4370 charbuf += ccl.produced;
4371 if (ccl.status != CCL_STAT_SUSPEND_BY_DST)
4372 break;
4373 }
4374 if (source < source_end)
4375 src += source_byteidx[source - source_charbuf];
4376 else
4377 src = p;
4378 consumed_chars += source - source_charbuf;
4379
4380 if (ccl.status != CCL_STAT_SUSPEND_BY_SRC
4381 && ccl.status != CODING_RESULT_INSUFFICIENT_SRC)
4382 break;
4383 }
4384
4385 switch (ccl.status)
4386 {
4387 case CCL_STAT_SUSPEND_BY_SRC:
4388 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4389 break;
4390 case CCL_STAT_SUSPEND_BY_DST:
4391 break;
4392 case CCL_STAT_QUIT:
4393 case CCL_STAT_INVALID_CMD:
4394 coding->result = CODING_RESULT_INTERRUPT;
4395 break;
4396 default:
4397 coding->result = CODING_RESULT_SUCCESS;
4398 break;
4399 }
4400 coding->consumed_char += consumed_chars;
4401 coding->consumed = src - coding->source;
4402 coding->charbuf_used = charbuf - coding->charbuf;
4403 }
4404
4405 static int
4406 encode_coding_ccl (coding)
4407 struct coding_system *coding;
4408 {
4409 struct ccl_program ccl;
4410 int multibytep = coding->dst_multibyte;
4411 int *charbuf = coding->charbuf;
4412 int *charbuf_end = charbuf + coding->charbuf_used;
4413 unsigned char *dst = coding->destination + coding->produced;
4414 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4415 unsigned char *adjusted_dst_end = dst_end - 1;
4416 int destination_charbuf[1024];
4417 int i, produced_chars = 0;
4418 Lisp_Object attrs, eol_type, charset_list;
4419
4420 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4421 setup_ccl_program (&ccl, CODING_CCL_ENCODER (coding));
4422
4423 ccl.last_block = coding->mode & CODING_MODE_LAST_BLOCK;
4424 ccl.dst_multibyte = coding->dst_multibyte;
4425
4426 while (charbuf < charbuf_end && dst < adjusted_dst_end)
4427 {
4428 int dst_bytes = dst_end - dst;
4429 if (dst_bytes > 1024)
4430 dst_bytes = 1024;
4431
4432 ccl_driver (&ccl, charbuf, destination_charbuf,
4433 charbuf_end - charbuf, dst_bytes, charset_list);
4434 charbuf += ccl.consumed;
4435 if (multibytep)
4436 for (i = 0; i < ccl.produced; i++)
4437 EMIT_ONE_BYTE (destination_charbuf[i] & 0xFF);
4438 else
4439 {
4440 for (i = 0; i < ccl.produced; i++)
4441 *dst++ = destination_charbuf[i] & 0xFF;
4442 produced_chars += ccl.produced;
4443 }
4444 }
4445
4446 switch (ccl.status)
4447 {
4448 case CCL_STAT_SUSPEND_BY_SRC:
4449 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4450 break;
4451 case CCL_STAT_SUSPEND_BY_DST:
4452 coding->result = CODING_RESULT_INSUFFICIENT_DST;
4453 break;
4454 case CCL_STAT_QUIT:
4455 case CCL_STAT_INVALID_CMD:
4456 coding->result = CODING_RESULT_INTERRUPT;
4457 break;
4458 default:
4459 coding->result = CODING_RESULT_SUCCESS;
4460 break;
4461 }
4462
4463 coding->produced_char += produced_chars;
4464 coding->produced = dst - coding->destination;
4465 return 0;
4466 }
4467
4468
4469 \f
4470 /*** 10, 11. no-conversion handlers ***/
4471
4472 /* See the above "GENERAL NOTES on `decode_coding_XXX ()' functions". */
4473
4474 static void
4475 decode_coding_raw_text (coding)
4476 struct coding_system *coding;
4477 {
4478 coding->chars_at_source = 1;
4479 coding->consumed_char = 0;
4480 coding->consumed = 0;
4481 coding->result = CODING_RESULT_SUCCESS;
4482 }
4483
4484 static int
4485 encode_coding_raw_text (coding)
4486 struct coding_system *coding;
4487 {
4488 int multibytep = coding->dst_multibyte;
4489 int *charbuf = coding->charbuf;
4490 int *charbuf_end = coding->charbuf + coding->charbuf_used;
4491 unsigned char *dst = coding->destination + coding->produced;
4492 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4493 int produced_chars = 0;
4494 int c;
4495
4496 if (multibytep)
4497 {
4498 int safe_room = MAX_MULTIBYTE_LENGTH * 2;
4499
4500 if (coding->src_multibyte)
4501 while (charbuf < charbuf_end)
4502 {
4503 ASSURE_DESTINATION (safe_room);
4504 c = *charbuf++;
4505 if (ASCII_CHAR_P (c))
4506 EMIT_ONE_ASCII_BYTE (c);
4507 else if (CHAR_BYTE8_P (c))
4508 {
4509 c = CHAR_TO_BYTE8 (c);
4510 EMIT_ONE_BYTE (c);
4511 }
4512 else
4513 {
4514 unsigned char str[MAX_MULTIBYTE_LENGTH], *p0 = str, *p1 = str;
4515
4516 CHAR_STRING_ADVANCE (c, p1);
4517 while (p0 < p1)
4518 {
4519 EMIT_ONE_BYTE (*p0);
4520 p0++;
4521 }
4522 }
4523 }
4524 else
4525 while (charbuf < charbuf_end)
4526 {
4527 ASSURE_DESTINATION (safe_room);
4528 c = *charbuf++;
4529 EMIT_ONE_BYTE (c);
4530 }
4531 }
4532 else
4533 {
4534 if (coding->src_multibyte)
4535 {
4536 int safe_room = MAX_MULTIBYTE_LENGTH;
4537
4538 while (charbuf < charbuf_end)
4539 {
4540 ASSURE_DESTINATION (safe_room);
4541 c = *charbuf++;
4542 if (ASCII_CHAR_P (c))
4543 *dst++ = c;
4544 else if (CHAR_BYTE8_P (c))
4545 *dst++ = CHAR_TO_BYTE8 (c);
4546 else
4547 CHAR_STRING_ADVANCE (c, dst);
4548 produced_chars++;
4549 }
4550 }
4551 else
4552 {
4553 ASSURE_DESTINATION (charbuf_end - charbuf);
4554 while (charbuf < charbuf_end && dst < dst_end)
4555 *dst++ = *charbuf++;
4556 produced_chars = dst - (coding->destination + coding->dst_bytes);
4557 }
4558 }
4559 coding->result = CODING_RESULT_SUCCESS;
4560 coding->produced_char += produced_chars;
4561 coding->produced = dst - coding->destination;
4562 return 0;
4563 }
4564
4565 /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions".
4566 Check if a text is encoded in a charset-based coding system. If it
4567 is, return 1, else return 0. */
4568
4569 static int
4570 detect_coding_charset (coding, detect_info)
4571 struct coding_system *coding;
4572 struct coding_detection_info *detect_info;
4573 {
4574 unsigned char *src = coding->source, *src_base = src;
4575 unsigned char *src_end = coding->source + coding->src_bytes;
4576 int multibytep = coding->src_multibyte;
4577 int consumed_chars = 0;
4578 Lisp_Object attrs, valids;
4579 int found = 0;
4580
4581 detect_info->checked |= CATEGORY_MASK_CHARSET;
4582
4583 coding = &coding_categories[coding_category_charset];
4584 attrs = CODING_ID_ATTRS (coding->id);
4585 valids = AREF (attrs, coding_attr_charset_valids);
4586
4587 if (! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
4588 src += coding->head_ascii;
4589
4590 while (1)
4591 {
4592 int c;
4593
4594 ONE_MORE_BYTE (c);
4595 if (NILP (AREF (valids, c)))
4596 break;
4597 if (c >= 0x80)
4598 found = CATEGORY_MASK_CHARSET;
4599 }
4600 detect_info->rejected |= CATEGORY_MASK_CHARSET;
4601 return 0;
4602
4603 no_more_source:
4604 detect_info->found |= found;
4605 return 1;
4606 }
4607
4608 static void
4609 decode_coding_charset (coding)
4610 struct coding_system *coding;
4611 {
4612 unsigned char *src = coding->source + coding->consumed;
4613 unsigned char *src_end = coding->source + coding->src_bytes;
4614 unsigned char *src_base;
4615 int *charbuf = coding->charbuf;
4616 int *charbuf_end = charbuf + coding->charbuf_size - MAX_ANNOTATION_LENGTH;
4617 int consumed_chars = 0, consumed_chars_base;
4618 int multibytep = coding->src_multibyte;
4619 Lisp_Object attrs, eol_type, charset_list, valids;
4620 int char_offset = coding->produced_char;
4621 int last_offset = char_offset;
4622 int last_id = charset_ascii;
4623
4624 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4625 valids = AREF (attrs, coding_attr_charset_valids);
4626
4627 while (1)
4628 {
4629 int c;
4630
4631 src_base = src;
4632 consumed_chars_base = consumed_chars;
4633
4634 if (charbuf >= charbuf_end)
4635 break;
4636
4637 ONE_MORE_BYTE (c);
4638 if (c == '\r')
4639 {
4640 /* Here we assume that no charset maps '\r' to something
4641 else. */
4642 if (EQ (eol_type, Qdos))
4643 {
4644 if (src == src_end)
4645 {
4646 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
4647 goto no_more_source;
4648 }
4649 if (*src == '\n')
4650 ONE_MORE_BYTE (c);
4651 }
4652 else if (EQ (eol_type, Qmac))
4653 c = '\n';
4654 }
4655 else
4656 {
4657 Lisp_Object val;
4658 struct charset *charset;
4659 int dim;
4660 int len = 1;
4661 unsigned code = c;
4662
4663 val = AREF (valids, c);
4664 if (NILP (val))
4665 goto invalid_code;
4666 if (INTEGERP (val))
4667 {
4668 charset = CHARSET_FROM_ID (XFASTINT (val));
4669 dim = CHARSET_DIMENSION (charset);
4670 while (len < dim)
4671 {
4672 ONE_MORE_BYTE (c);
4673 code = (code << 8) | c;
4674 len++;
4675 }
4676 CODING_DECODE_CHAR (coding, src, src_base, src_end,
4677 charset, code, c);
4678 }
4679 else
4680 {
4681 /* VAL is a list of charset IDs. It is assured that the
4682 list is sorted by charset dimensions (smaller one
4683 comes first). */
4684 while (CONSP (val))
4685 {
4686 charset = CHARSET_FROM_ID (XFASTINT (XCAR (val)));
4687 dim = CHARSET_DIMENSION (charset);
4688 while (len < dim)
4689 {
4690 ONE_MORE_BYTE (c);
4691 code = (code << 8) | c;
4692 len++;
4693 }
4694 CODING_DECODE_CHAR (coding, src, src_base,
4695 src_end, charset, code, c);
4696 if (c >= 0)
4697 break;
4698 val = XCDR (val);
4699 }
4700 }
4701 if (c < 0)
4702 goto invalid_code;
4703 if (charset->id != charset_ascii
4704 && last_id != charset->id)
4705 {
4706 if (last_id != charset_ascii)
4707 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4708 last_id = charset->id;
4709 last_offset = char_offset;
4710 }
4711 }
4712 *charbuf++ = c;
4713 char_offset++;
4714 continue;
4715
4716 invalid_code:
4717 src = src_base;
4718 consumed_chars = consumed_chars_base;
4719 ONE_MORE_BYTE (c);
4720 *charbuf++ = ASCII_BYTE_P (c) ? c : BYTE8_TO_CHAR (c);
4721 char_offset++;
4722 coding->errors++;
4723 }
4724
4725 no_more_source:
4726 if (last_id != charset_ascii)
4727 ADD_CHARSET_DATA (charbuf, last_offset, char_offset, last_id);
4728 coding->consumed_char += consumed_chars_base;
4729 coding->consumed = src_base - coding->source;
4730 coding->charbuf_used = charbuf - coding->charbuf;
4731 }
4732
4733 static int
4734 encode_coding_charset (coding)
4735 struct coding_system *coding;
4736 {
4737 int multibytep = coding->dst_multibyte;
4738 int *charbuf = coding->charbuf;
4739 int *charbuf_end = charbuf + coding->charbuf_used;
4740 unsigned char *dst = coding->destination + coding->produced;
4741 unsigned char *dst_end = coding->destination + coding->dst_bytes;
4742 int safe_room = MAX_MULTIBYTE_LENGTH;
4743 int produced_chars = 0;
4744 Lisp_Object attrs, eol_type, charset_list;
4745 int ascii_compatible;
4746 int c;
4747
4748 CODING_GET_INFO (coding, attrs, eol_type, charset_list);
4749 ascii_compatible = ! NILP (CODING_ATTR_ASCII_COMPAT (attrs));
4750
4751 while (charbuf < charbuf_end)
4752 {
4753 struct charset *charset;
4754 unsigned code;
4755
4756 ASSURE_DESTINATION (safe_room);
4757 c = *charbuf++;
4758 if (ascii_compatible && ASCII_CHAR_P (c))
4759 EMIT_ONE_ASCII_BYTE (c);
4760 else if (CHAR_BYTE8_P (c))
4761 {
4762 c = CHAR_TO_BYTE8 (c);
4763 EMIT_ONE_BYTE (c);
4764 }
4765 else
4766 {
4767 charset = char_charset (c, charset_list, &code);
4768 if (charset)
4769 {
4770 if (CHARSET_DIMENSION (charset) == 1)
4771 EMIT_ONE_BYTE (code);
4772 else if (CHARSET_DIMENSION (charset) == 2)
4773 EMIT_TWO_BYTES (code >> 8, code & 0xFF);
4774 else if (CHARSET_DIMENSION (charset) == 3)
4775 EMIT_THREE_BYTES (code >> 16, (code >> 8) & 0xFF, code & 0xFF);
4776 else
4777 EMIT_FOUR_BYTES (code >> 24, (code >> 16) & 0xFF,
4778 (code >> 8) & 0xFF, code & 0xFF);
4779 }
4780 else
4781 {
4782 if (coding->mode & CODING_MODE_SAFE_ENCODING)
4783 c = CODING_INHIBIT_CHARACTER_SUBSTITUTION;
4784 else
4785 c = coding->default_char;
4786 EMIT_ONE_BYTE (c);
4787 }
4788 }
4789 }
4790
4791 coding->result = CODING_RESULT_SUCCESS;
4792 coding->produced_char += produced_chars;
4793 coding->produced = dst - coding->destination;
4794 return 0;
4795 }
4796
4797 \f
4798 /*** 7. C library functions ***/
4799
4800 /* Setup coding context CODING from information about CODING_SYSTEM.
4801 If CODING_SYSTEM is nil, `no-conversion' is assumed. If
4802 CODING_SYSTEM is invalid, signal an error. */
4803
4804 void
4805 setup_coding_system (coding_system, coding)
4806 Lisp_Object coding_system;
4807 struct coding_system *coding;
4808 {
4809 Lisp_Object attrs;
4810 Lisp_Object eol_type;
4811 Lisp_Object coding_type;
4812 Lisp_Object val;
4813
4814 if (NILP (coding_system))
4815 coding_system = Qno_conversion;
4816
4817 CHECK_CODING_SYSTEM_GET_ID (coding_system, coding->id);
4818
4819 attrs = CODING_ID_ATTRS (coding->id);
4820 eol_type = CODING_ID_EOL_TYPE (coding->id);
4821
4822 coding->mode = 0;
4823 coding->head_ascii = -1;
4824 coding->common_flags
4825 = (VECTORP (eol_type) ? CODING_REQUIRE_DETECTION_MASK : 0);
4826 if (! NILP (CODING_ATTR_POST_READ (attrs)))
4827 coding->common_flags |= CODING_REQUIRE_DECODING_MASK;
4828 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
4829 coding->common_flags |= CODING_REQUIRE_ENCODING_MASK;
4830
4831 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4832 coding->max_charset_id = XSTRING (val)->size - 1;
4833 coding->safe_charsets = (char *) XSTRING (val)->data;
4834 coding->default_char = XINT (CODING_ATTR_DEFAULT_CHAR (attrs));
4835
4836 coding_type = CODING_ATTR_TYPE (attrs);
4837 if (EQ (coding_type, Qundecided))
4838 {
4839 coding->detector = NULL;
4840 coding->decoder = decode_coding_raw_text;
4841 coding->encoder = encode_coding_raw_text;
4842 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4843 }
4844 else if (EQ (coding_type, Qiso_2022))
4845 {
4846 int i;
4847 int flags = XINT (AREF (attrs, coding_attr_iso_flags));
4848
4849 /* Invoke graphic register 0 to plane 0. */
4850 CODING_ISO_INVOCATION (coding, 0) = 0;
4851 /* Invoke graphic register 1 to plane 1 if we can use 8-bit. */
4852 CODING_ISO_INVOCATION (coding, 1)
4853 = (flags & CODING_ISO_FLAG_SEVEN_BITS ? -1 : 1);
4854 /* Setup the initial status of designation. */
4855 for (i = 0; i < 4; i++)
4856 CODING_ISO_DESIGNATION (coding, i) = CODING_ISO_INITIAL (coding, i);
4857 /* Not single shifting initially. */
4858 CODING_ISO_SINGLE_SHIFTING (coding) = 0;
4859 /* Beginning of buffer should also be regarded as bol. */
4860 CODING_ISO_BOL (coding) = 1;
4861 coding->detector = detect_coding_iso_2022;
4862 coding->decoder = decode_coding_iso_2022;
4863 coding->encoder = encode_coding_iso_2022;
4864 if (flags & CODING_ISO_FLAG_SAFE)
4865 coding->mode |= CODING_MODE_SAFE_ENCODING;
4866 coding->common_flags
4867 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4868 | CODING_REQUIRE_FLUSHING_MASK);
4869 if (flags & CODING_ISO_FLAG_COMPOSITION)
4870 coding->common_flags |= CODING_ANNOTATE_COMPOSITION_MASK;
4871 if (flags & CODING_ISO_FLAG_DESIGNATION)
4872 coding->common_flags |= CODING_ANNOTATE_CHARSET_MASK;
4873 if (flags & CODING_ISO_FLAG_FULL_SUPPORT)
4874 {
4875 setup_iso_safe_charsets (attrs);
4876 val = CODING_ATTR_SAFE_CHARSETS (attrs);
4877 coding->max_charset_id = XSTRING (val)->size - 1;
4878 coding->safe_charsets = (char *) XSTRING (val)->data;
4879 }
4880 CODING_ISO_FLAGS (coding) = flags;
4881 }
4882 else if (EQ (coding_type, Qcharset))
4883 {
4884 coding->detector = detect_coding_charset;
4885 coding->decoder = decode_coding_charset;
4886 coding->encoder = encode_coding_charset;
4887 coding->common_flags
4888 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4889 }
4890 else if (EQ (coding_type, Qutf_8))
4891 {
4892 coding->detector = detect_coding_utf_8;
4893 coding->decoder = decode_coding_utf_8;
4894 coding->encoder = encode_coding_utf_8;
4895 coding->common_flags
4896 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4897 }
4898 else if (EQ (coding_type, Qutf_16))
4899 {
4900 val = AREF (attrs, coding_attr_utf_16_bom);
4901 CODING_UTF_16_BOM (coding) = (CONSP (val) ? utf_16_detect_bom
4902 : EQ (val, Qt) ? utf_16_with_bom
4903 : utf_16_without_bom);
4904 val = AREF (attrs, coding_attr_utf_16_endian);
4905 CODING_UTF_16_ENDIAN (coding) = (EQ (val, Qbig) ? utf_16_big_endian
4906 : utf_16_little_endian);
4907 CODING_UTF_16_SURROGATE (coding) = 0;
4908 coding->detector = detect_coding_utf_16;
4909 coding->decoder = decode_coding_utf_16;
4910 coding->encoder = encode_coding_utf_16;
4911 coding->common_flags
4912 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4913 if (CODING_UTF_16_BOM (coding) == utf_16_detect_bom)
4914 coding->common_flags |= CODING_REQUIRE_DETECTION_MASK;
4915 }
4916 else if (EQ (coding_type, Qccl))
4917 {
4918 coding->detector = detect_coding_ccl;
4919 coding->decoder = decode_coding_ccl;
4920 coding->encoder = encode_coding_ccl;
4921 coding->common_flags
4922 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK
4923 | CODING_REQUIRE_FLUSHING_MASK);
4924 }
4925 else if (EQ (coding_type, Qemacs_mule))
4926 {
4927 coding->detector = detect_coding_emacs_mule;
4928 coding->decoder = decode_coding_emacs_mule;
4929 coding->encoder = encode_coding_emacs_mule;
4930 coding->common_flags
4931 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4932 if (! NILP (AREF (attrs, coding_attr_emacs_mule_full))
4933 && ! EQ (CODING_ATTR_CHARSET_LIST (attrs), Vemacs_mule_charset_list))
4934 {
4935 Lisp_Object tail, safe_charsets;
4936 int max_charset_id = 0;
4937
4938 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4939 tail = XCDR (tail))
4940 if (max_charset_id < XFASTINT (XCAR (tail)))
4941 max_charset_id = XFASTINT (XCAR (tail));
4942 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
4943 make_number (255));
4944 for (tail = Vemacs_mule_charset_list; CONSP (tail);
4945 tail = XCDR (tail))
4946 XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
4947 coding->max_charset_id = max_charset_id;
4948 coding->safe_charsets = (char *) XSTRING (safe_charsets)->data;
4949 }
4950 }
4951 else if (EQ (coding_type, Qshift_jis))
4952 {
4953 coding->detector = detect_coding_sjis;
4954 coding->decoder = decode_coding_sjis;
4955 coding->encoder = encode_coding_sjis;
4956 coding->common_flags
4957 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4958 }
4959 else if (EQ (coding_type, Qbig5))
4960 {
4961 coding->detector = detect_coding_big5;
4962 coding->decoder = decode_coding_big5;
4963 coding->encoder = encode_coding_big5;
4964 coding->common_flags
4965 |= (CODING_REQUIRE_DECODING_MASK | CODING_REQUIRE_ENCODING_MASK);
4966 }
4967 else /* EQ (coding_type, Qraw_text) */
4968 {
4969 coding->detector = NULL;
4970 coding->decoder = decode_coding_raw_text;
4971 coding->encoder = encode_coding_raw_text;
4972 coding->common_flags |= CODING_FOR_UNIBYTE_MASK;
4973 }
4974
4975 return;
4976 }
4977
4978 /* Return raw-text or one of its subsidiaries that has the same
4979 eol_type as CODING-SYSTEM. */
4980
4981 Lisp_Object
4982 raw_text_coding_system (coding_system)
4983 Lisp_Object coding_system;
4984 {
4985 Lisp_Object spec, attrs;
4986 Lisp_Object eol_type, raw_text_eol_type;
4987
4988 if (NILP (coding_system))
4989 return Qraw_text;
4990 spec = CODING_SYSTEM_SPEC (coding_system);
4991 attrs = AREF (spec, 0);
4992
4993 if (EQ (CODING_ATTR_TYPE (attrs), Qraw_text))
4994 return coding_system;
4995
4996 eol_type = AREF (spec, 2);
4997 if (VECTORP (eol_type))
4998 return Qraw_text;
4999 spec = CODING_SYSTEM_SPEC (Qraw_text);
5000 raw_text_eol_type = AREF (spec, 2);
5001 return (EQ (eol_type, Qunix) ? AREF (raw_text_eol_type, 0)
5002 : EQ (eol_type, Qdos) ? AREF (raw_text_eol_type, 1)
5003 : AREF (raw_text_eol_type, 2));
5004 }
5005
5006
5007 /* If CODING_SYSTEM doesn't specify end-of-line format but PARENT
5008 does, return one of the subsidiary that has the same eol-spec as
5009 PARENT. Otherwise, return CODING_SYSTEM. */
5010
5011 Lisp_Object
5012 coding_inherit_eol_type (coding_system, parent)
5013 Lisp_Object coding_system, parent;
5014 {
5015 Lisp_Object spec, attrs, eol_type;
5016
5017 if (NILP (coding_system))
5018 coding_system = Qraw_text;
5019 spec = CODING_SYSTEM_SPEC (coding_system);
5020 attrs = AREF (spec, 0);
5021 eol_type = AREF (spec, 2);
5022 if (VECTORP (eol_type)
5023 && ! NILP (parent))
5024 {
5025 Lisp_Object parent_spec;
5026 Lisp_Object parent_eol_type;
5027
5028 parent_spec
5029 = CODING_SYSTEM_SPEC (buffer_defaults.buffer_file_coding_system);
5030 parent_eol_type = AREF (parent_spec, 2);
5031 if (EQ (parent_eol_type, Qunix))
5032 coding_system = AREF (eol_type, 0);
5033 else if (EQ (parent_eol_type, Qdos))
5034 coding_system = AREF (eol_type, 1);
5035 else if (EQ (parent_eol_type, Qmac))
5036 coding_system = AREF (eol_type, 2);
5037 }
5038 return coding_system;
5039 }
5040
5041 /* Emacs has a mechanism to automatically detect a coding system if it
5042 is one of Emacs' internal format, ISO2022, SJIS, and BIG5. But,
5043 it's impossible to distinguish some coding systems accurately
5044 because they use the same range of codes. So, at first, coding
5045 systems are categorized into 7, those are:
5046
5047 o coding-category-emacs-mule
5048
5049 The category for a coding system which has the same code range
5050 as Emacs' internal format. Assigned the coding-system (Lisp
5051 symbol) `emacs-mule' by default.
5052
5053 o coding-category-sjis
5054
5055 The category for a coding system which has the same code range
5056 as SJIS. Assigned the coding-system (Lisp
5057 symbol) `japanese-shift-jis' by default.
5058
5059 o coding-category-iso-7
5060
5061 The category for a coding system which has the same code range
5062 as ISO2022 of 7-bit environment. This doesn't use any locking
5063 shift and single shift functions. This can encode/decode all
5064 charsets. Assigned the coding-system (Lisp symbol)
5065 `iso-2022-7bit' by default.
5066
5067 o coding-category-iso-7-tight
5068
5069 Same as coding-category-iso-7 except that this can
5070 encode/decode only the specified charsets.
5071
5072 o coding-category-iso-8-1
5073
5074 The category for a coding system which has the same code range
5075 as ISO2022 of 8-bit environment and graphic plane 1 used only
5076 for DIMENSION1 charset. This doesn't use any locking shift
5077 and single shift functions. Assigned the coding-system (Lisp
5078 symbol) `iso-latin-1' by default.
5079
5080 o coding-category-iso-8-2
5081
5082 The category for a coding system which has the same code range
5083 as ISO2022 of 8-bit environment and graphic plane 1 used only
5084 for DIMENSION2 charset. This doesn't use any locking shift
5085 and single shift functions. Assigned the coding-system (Lisp
5086 symbol) `japanese-iso-8bit' by default.
5087
5088 o coding-category-iso-7-else
5089
5090 The category for a coding system which has the same code range
5091 as ISO2022 of 7-bit environemnt but uses locking shift or
5092 single shift functions. Assigned the coding-system (Lisp
5093 symbol) `iso-2022-7bit-lock' by default.
5094
5095 o coding-category-iso-8-else
5096
5097 The category for a coding system which has the same code range
5098 as ISO2022 of 8-bit environemnt but uses locking shift or
5099 single shift functions. Assigned the coding-system (Lisp
5100 symbol) `iso-2022-8bit-ss2' by default.
5101
5102 o coding-category-big5
5103
5104 The category for a coding system which has the same code range
5105 as BIG5. Assigned the coding-system (Lisp symbol)
5106 `cn-big5' by default.
5107
5108 o coding-category-utf-8
5109
5110 The category for a coding system which has the same code range
5111 as UTF-8 (cf. RFC2279). Assigned the coding-system (Lisp
5112 symbol) `utf-8' by default.
5113
5114 o coding-category-utf-16-be
5115
5116 The category for a coding system in which a text has an
5117 Unicode signature (cf. Unicode Standard) in the order of BIG
5118 endian at the head. Assigned the coding-system (Lisp symbol)
5119 `utf-16-be' by default.
5120
5121 o coding-category-utf-16-le
5122
5123 The category for a coding system in which a text has an
5124 Unicode signature (cf. Unicode Standard) in the order of
5125 LITTLE endian at the head. Assigned the coding-system (Lisp
5126 symbol) `utf-16-le' by default.
5127
5128 o coding-category-ccl
5129
5130 The category for a coding system of which encoder/decoder is
5131 written in CCL programs. The default value is nil, i.e., no
5132 coding system is assigned.
5133
5134 o coding-category-binary
5135
5136 The category for a coding system not categorized in any of the
5137 above. Assigned the coding-system (Lisp symbol)
5138 `no-conversion' by default.
5139
5140 Each of them is a Lisp symbol and the value is an actual
5141 `coding-system's (this is also a Lisp symbol) assigned by a user.
5142 What Emacs does actually is to detect a category of coding system.
5143 Then, it uses a `coding-system' assigned to it. If Emacs can't
5144 decide only one possible category, it selects a category of the
5145 highest priority. Priorities of categories are also specified by a
5146 user in a Lisp variable `coding-category-list'.
5147
5148 */
5149
5150 #define EOL_SEEN_NONE 0
5151 #define EOL_SEEN_LF 1
5152 #define EOL_SEEN_CR 2
5153 #define EOL_SEEN_CRLF 4
5154
5155 /* Detect how end-of-line of a text of length SRC_BYTES pointed by
5156 SOURCE is encoded. If CATEGORY is one of
5157 coding_category_utf_16_XXXX, assume that CR and LF are encoded by
5158 two-byte, else they are encoded by one-byte.
5159
5160 Return one of EOL_SEEN_XXX. */
5161
5162 #define MAX_EOL_CHECK_COUNT 3
5163
5164 static int
5165 detect_eol (source, src_bytes, category)
5166 unsigned char *source;
5167 EMACS_INT src_bytes;
5168 enum coding_category category;
5169 {
5170 unsigned char *src = source, *src_end = src + src_bytes;
5171 unsigned char c;
5172 int total = 0;
5173 int eol_seen = EOL_SEEN_NONE;
5174
5175 if ((1 << category) & CATEGORY_MASK_UTF_16)
5176 {
5177 int msb, lsb;
5178
5179 msb = category == (coding_category_utf_16_le
5180 | coding_category_utf_16_le_nosig);
5181 lsb = 1 - msb;
5182
5183 while (src + 1 < src_end)
5184 {
5185 c = src[lsb];
5186 if (src[msb] == 0 && (c == '\n' || c == '\r'))
5187 {
5188 int this_eol;
5189
5190 if (c == '\n')
5191 this_eol = EOL_SEEN_LF;
5192 else if (src + 3 >= src_end
5193 || src[msb + 2] != 0
5194 || src[lsb + 2] != '\n')
5195 this_eol = EOL_SEEN_CR;
5196 else
5197 this_eol = EOL_SEEN_CRLF;
5198
5199 if (eol_seen == EOL_SEEN_NONE)
5200 /* This is the first end-of-line. */
5201 eol_seen = this_eol;
5202 else if (eol_seen != this_eol)
5203 {
5204 /* The found type is different from what found before. */
5205 eol_seen = EOL_SEEN_LF;
5206 break;
5207 }
5208 if (++total == MAX_EOL_CHECK_COUNT)
5209 break;
5210 }
5211 src += 2;
5212 }
5213 }
5214 else
5215 {
5216 while (src < src_end)
5217 {
5218 c = *src++;
5219 if (c == '\n' || c == '\r')
5220 {
5221 int this_eol;
5222
5223 if (c == '\n')
5224 this_eol = EOL_SEEN_LF;
5225 else if (src >= src_end || *src != '\n')
5226 this_eol = EOL_SEEN_CR;
5227 else
5228 this_eol = EOL_SEEN_CRLF, src++;
5229
5230 if (eol_seen == EOL_SEEN_NONE)
5231 /* This is the first end-of-line. */
5232 eol_seen = this_eol;
5233 else if (eol_seen != this_eol)
5234 {
5235 /* The found type is different from what found before. */
5236 eol_seen = EOL_SEEN_LF;
5237 break;
5238 }
5239 if (++total == MAX_EOL_CHECK_COUNT)
5240 break;
5241 }
5242 }
5243 }
5244 return eol_seen;
5245 }
5246
5247
5248 static void
5249 adjust_coding_eol_type (coding, eol_seen)
5250 struct coding_system *coding;
5251 int eol_seen;
5252 {
5253 Lisp_Object eol_type;
5254
5255 eol_type = CODING_ID_EOL_TYPE (coding->id);
5256 if (eol_seen & EOL_SEEN_LF)
5257 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 0));
5258 else if (eol_seen & EOL_SEEN_CRLF)
5259 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 1));
5260 else if (eol_seen & EOL_SEEN_CR)
5261 coding->id = CODING_SYSTEM_ID (AREF (eol_type, 2));
5262 }
5263
5264 /* Detect how a text specified in CODING is encoded. If a coding
5265 system is detected, update fields of CODING by the detected coding
5266 system. */
5267
5268 void
5269 detect_coding (coding)
5270 struct coding_system *coding;
5271 {
5272 unsigned char *src, *src_end;
5273 Lisp_Object attrs, coding_type;
5274
5275 coding->consumed = coding->consumed_char = 0;
5276 coding->produced = coding->produced_char = 0;
5277 coding_set_source (coding);
5278
5279 src_end = coding->source + coding->src_bytes;
5280
5281 /* If we have not yet decided the text encoding type, detect it
5282 now. */
5283 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qundecided))
5284 {
5285 int c, i;
5286
5287 for (src = coding->source; src < src_end; src++)
5288 {
5289 c = *src;
5290 if (c & 0x80 || (c < 0x20 && (c == ISO_CODE_ESC
5291 || c == ISO_CODE_SI
5292 || c == ISO_CODE_SO)))
5293 break;
5294 }
5295 coding->head_ascii = src - (coding->source + coding->consumed);
5296
5297 if (coding->head_ascii < coding->src_bytes)
5298 {
5299 struct coding_detection_info detect_info;
5300 enum coding_category category;
5301 struct coding_system *this;
5302
5303 detect_info.checked = detect_info.found = detect_info.rejected = 0;
5304 for (i = 0; i < coding_category_raw_text; i++)
5305 {
5306 category = coding_priorities[i];
5307 this = coding_categories + category;
5308 if (this->id < 0)
5309 {
5310 /* No coding system of this category is defined. */
5311 detect_info.rejected |= (1 << category);
5312 }
5313 else if (category >= coding_category_raw_text)
5314 continue;
5315 else if (detect_info.checked & (1 << category))
5316 {
5317 if (detect_info.found & (1 << category))
5318 break;
5319 }
5320 else if ((*(this->detector)) (coding, &detect_info)
5321 && detect_info.found & (1 << category))
5322 break;
5323 }
5324 if (i < coding_category_raw_text)
5325 setup_coding_system (CODING_ID_NAME (this->id), coding);
5326 else if (detect_info.rejected == CATEGORY_MASK_ANY)
5327 setup_coding_system (Qraw_text, coding);
5328 else if (detect_info.rejected)
5329 for (i = 0; i < coding_category_raw_text; i++)
5330 if (! (detect_info.rejected & (1 << coding_priorities[i])))
5331 {
5332 this = coding_categories + coding_priorities[i];
5333 setup_coding_system (CODING_ID_NAME (this->id), coding);
5334 break;
5335 }
5336 }
5337 }
5338 else if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qutf_16))
5339 {
5340 Lisp_Object coding_systems;
5341 struct coding_detection_info detect_info;
5342
5343 coding_systems
5344 = AREF (CODING_ID_ATTRS (coding->id), coding_attr_utf_16_bom);
5345 detect_info.found = detect_info.rejected = 0;
5346 if (CONSP (coding_systems)
5347 && detect_coding_utf_16 (coding, &detect_info)
5348 && (detect_info.found & (CATEGORY_MASK_UTF_16_LE
5349 | CATEGORY_MASK_UTF_16_BE)))
5350 {
5351 if (detect_info.found & CATEGORY_MASK_UTF_16_LE)
5352 setup_coding_system (XCAR (coding_systems), coding);
5353 else
5354 setup_coding_system (XCDR (coding_systems), coding);
5355 }
5356 }
5357
5358 attrs = CODING_ID_ATTRS (coding->id);
5359 coding_type = CODING_ATTR_TYPE (attrs);
5360
5361 /* If we have not yet decided the EOL type, detect it now. But, the
5362 detection is impossible for a CCL based coding system, in which
5363 case, we detct the EOL type after decoding. */
5364 if (VECTORP (CODING_ID_EOL_TYPE (coding->id))
5365 && ! EQ (coding_type, Qccl))
5366 {
5367 int eol_seen = detect_eol (coding->source, coding->src_bytes,
5368 XINT (CODING_ATTR_CATEGORY (attrs)));
5369
5370 if (eol_seen != EOL_SEEN_NONE)
5371 adjust_coding_eol_type (coding, eol_seen);
5372 }
5373 }
5374
5375
5376 static void
5377 decode_eol (coding)
5378 struct coding_system *coding;
5379 {
5380 if (VECTORP (CODING_ID_EOL_TYPE (coding->id)))
5381 {
5382 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5383 unsigned char *pend = p + coding->produced;
5384 int eol_seen = EOL_SEEN_NONE;
5385
5386 for (; p < pend; p++)
5387 {
5388 if (*p == '\n')
5389 eol_seen |= EOL_SEEN_LF;
5390 else if (*p == '\r')
5391 {
5392 if (p + 1 < pend && *(p + 1) == '\n')
5393 {
5394 eol_seen |= EOL_SEEN_CRLF;
5395 p++;
5396 }
5397 else
5398 eol_seen |= EOL_SEEN_CR;
5399 }
5400 }
5401 if (eol_seen != EOL_SEEN_NONE)
5402 adjust_coding_eol_type (coding, eol_seen);
5403 }
5404
5405 if (EQ (CODING_ID_EOL_TYPE (coding->id), Qmac))
5406 {
5407 unsigned char *p = CHAR_POS_ADDR (coding->dst_pos);
5408 unsigned char *pend = p + coding->produced;
5409
5410 for (; p < pend; p++)
5411 if (*p == '\r')
5412 *p = '\n';
5413 }
5414 else if (EQ (CODING_ID_EOL_TYPE (coding->id), Qdos))
5415 {
5416 unsigned char *p, *pbeg, *pend;
5417 Lisp_Object undo_list;
5418
5419 move_gap_both (coding->dst_pos + coding->produced_char,
5420 coding->dst_pos_byte + coding->produced);
5421 undo_list = current_buffer->undo_list;
5422 current_buffer->undo_list = Qt;
5423 del_range_2 (coding->dst_pos, coding->dst_pos_byte, GPT, GPT_BYTE, 0);
5424 current_buffer->undo_list = undo_list;
5425 pbeg = GPT_ADDR;
5426 pend = pbeg + coding->produced;
5427
5428 for (p = pend - 1; p >= pbeg; p--)
5429 if (*p == '\r')
5430 {
5431 safe_bcopy ((char *) (p + 1), (char *) p, pend - p - 1);
5432 pend--;
5433 }
5434 coding->produced_char -= coding->produced - (pend - pbeg);
5435 coding->produced = pend - pbeg;
5436 insert_from_gap (coding->produced_char, coding->produced);
5437 }
5438 }
5439
5440 static void
5441 translate_chars (coding, table)
5442 struct coding_system *coding;
5443 Lisp_Object table;
5444 {
5445 int *charbuf = coding->charbuf;
5446 int *charbuf_end = charbuf + coding->charbuf_used;
5447 int c;
5448
5449 if (coding->chars_at_source)
5450 return;
5451
5452 while (charbuf < charbuf_end)
5453 {
5454 c = *charbuf;
5455 if (c < 0)
5456 charbuf += c;
5457 else
5458 *charbuf++ = translate_char (table, c);
5459 }
5460 }
5461
5462 static int
5463 produce_chars (coding)
5464 struct coding_system *coding;
5465 {
5466 unsigned char *dst = coding->destination + coding->produced;
5467 unsigned char *dst_end = coding->destination + coding->dst_bytes;
5468 int produced;
5469 int produced_chars = 0;
5470
5471 if (! coding->chars_at_source)
5472 {
5473 /* Characters are in coding->charbuf. */
5474 int *buf = coding->charbuf;
5475 int *buf_end = buf + coding->charbuf_used;
5476 unsigned char *adjusted_dst_end;
5477
5478 if (BUFFERP (coding->src_object)
5479 && EQ (coding->src_object, coding->dst_object))
5480 dst_end = coding->source + coding->consumed;
5481 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5482
5483 while (buf < buf_end)
5484 {
5485 int c = *buf++;
5486
5487 if (dst >= adjusted_dst_end)
5488 {
5489 dst = alloc_destination (coding,
5490 buf_end - buf + MAX_MULTIBYTE_LENGTH,
5491 dst);
5492 dst_end = coding->destination + coding->dst_bytes;
5493 adjusted_dst_end = dst_end - MAX_MULTIBYTE_LENGTH;
5494 }
5495 if (c >= 0)
5496 {
5497 if (coding->dst_multibyte
5498 || ! CHAR_BYTE8_P (c))
5499 CHAR_STRING_ADVANCE (c, dst);
5500 else
5501 *dst++ = CHAR_TO_BYTE8 (c);
5502 produced_chars++;
5503 }
5504 else
5505 /* This is an annotation datum. (-C) is the length of
5506 it. */
5507 buf += -c - 1;
5508 }
5509 }
5510 else
5511 {
5512 unsigned char *src = coding->source;
5513 unsigned char *src_end = src + coding->src_bytes;
5514 Lisp_Object eol_type;
5515
5516 eol_type = CODING_ID_EOL_TYPE (coding->id);
5517
5518 if (coding->src_multibyte != coding->dst_multibyte)
5519 {
5520 if (coding->src_multibyte)
5521 {
5522 int multibytep = 1;
5523 int consumed_chars;
5524
5525 while (1)
5526 {
5527 unsigned char *src_base = src;
5528 int c;
5529
5530 ONE_MORE_BYTE (c);
5531 if (c == '\r')
5532 {
5533 if (EQ (eol_type, Qdos))
5534 {
5535 if (src == src_end)
5536 {
5537 coding->result = CODING_RESULT_INSUFFICIENT_SRC;
5538 goto no_more_source;
5539 }
5540 if (*src == '\n')
5541 c = *src++;
5542 }
5543 else if (EQ (eol_type, Qmac))
5544 c = '\n';
5545 }
5546 if (dst == dst_end)
5547 {
5548 coding->consumed = src - coding->source;
5549
5550 if (EQ (coding->src_object, coding->dst_object))
5551 dst_end = src;
5552 if (dst == dst_end)
5553 {
5554 dst = alloc_destination (coding, src_end - src + 1,
5555 dst);
5556 dst_end = coding->destination + coding->dst_bytes;
5557 coding_set_source (coding);
5558 src = coding->source + coding->consumed;
5559 src_end = coding->source + coding->src_bytes;
5560 }
5561 }
5562 *dst++ = c;
5563 produced_chars++;
5564 }
5565 no_more_source:
5566 ;
5567 }
5568 else
5569 while (src < src_end)
5570 {
5571 int multibytep = 1;
5572 int c = *src++;
5573
5574 if (c == '\r')
5575 {
5576 if (EQ (eol_type, Qdos))
5577 {
5578 if (src < src_end
5579 && *src == '\n')
5580 c = *src++;
5581 }
5582 else if (EQ (eol_type, Qmac))
5583 c = '\n';
5584 }
5585 if (dst >= dst_end - 1)
5586 {
5587 coding->consumed = src - coding->source;
5588
5589 if (EQ (coding->src_object, coding->dst_object))
5590 dst_end = src;
5591 if (dst >= dst_end - 1)
5592 {
5593 dst = alloc_destination (coding, src_end - src + 2,
5594 dst);
5595 dst_end = coding->destination + coding->dst_bytes;
5596 coding_set_source (coding);
5597 src = coding->source + coding->consumed;
5598 src_end = coding->source + coding->src_bytes;
5599 }
5600 }
5601 EMIT_ONE_BYTE (c);
5602 }
5603 }
5604 else
5605 {
5606 if (!EQ (coding->src_object, coding->dst_object))
5607 {
5608 int require = coding->src_bytes - coding->dst_bytes;
5609
5610 if (require > 0)
5611 {
5612 EMACS_INT offset = src - coding->source;
5613
5614 dst = alloc_destination (coding, require, dst);
5615 coding_set_source (coding);
5616 src = coding->source + offset;
5617 src_end = coding->source + coding->src_bytes;
5618 }
5619 }
5620 produced_chars = coding->src_chars;
5621 while (src < src_end)
5622 {
5623 int c = *src++;
5624
5625 if (c == '\r')
5626 {
5627 if (EQ (eol_type, Qdos))
5628 {
5629 if (src < src_end
5630 && *src == '\n')
5631 c = *src++;
5632 produced_chars--;
5633 }
5634 else if (EQ (eol_type, Qmac))
5635 c = '\n';
5636 }
5637 *dst++ = c;
5638 }
5639 }
5640 coding->consumed = coding->src_bytes;
5641 coding->consumed_char = coding->src_chars;
5642 }
5643
5644 produced = dst - (coding->destination + coding->produced);
5645 if (BUFFERP (coding->dst_object))
5646 insert_from_gap (produced_chars, produced);
5647 coding->produced += produced;
5648 coding->produced_char += produced_chars;
5649 return produced_chars;
5650 }
5651
5652 /* Compose text in CODING->object according to the annotation data at
5653 CHARBUF. CHARBUF is an array:
5654 [ -LENGTH ANNOTATION_MASK FROM TO METHOD COMP_LEN [ COMPONENTS... ] ]
5655 */
5656
5657 static INLINE void
5658 produce_composition (coding, charbuf)
5659 struct coding_system *coding;
5660 int *charbuf;
5661 {
5662 int len;
5663 EMACS_INT from, to;
5664 enum composition_method method;
5665 Lisp_Object components;
5666
5667 len = -charbuf[0];
5668 from = coding->dst_pos + charbuf[2];
5669 to = coding->dst_pos + charbuf[3];
5670 method = (enum composition_method) (charbuf[4]);
5671
5672 if (method == COMPOSITION_RELATIVE)
5673 components = Qnil;
5674 else
5675 {
5676 Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1];
5677 int i;
5678
5679 len -= 5;
5680 charbuf += 5;
5681 for (i = 0; i < len; i++)
5682 args[i] = make_number (charbuf[i]);
5683 components = (method == COMPOSITION_WITH_ALTCHARS
5684 ? Fstring (len, args) : Fvector (len, args));
5685 }
5686 compose_text (from, to, components, Qnil, coding->dst_object);
5687 }
5688
5689
5690 /* Put `charset' property on text in CODING->object according to
5691 the annotation data at CHARBUF. CHARBUF is an array:
5692 [ -LENGTH ANNOTATION_MASK FROM TO CHARSET-ID ]
5693 */
5694
5695 static INLINE void
5696 produce_charset (coding, charbuf)
5697 struct coding_system *coding;
5698 int *charbuf;
5699 {
5700 EMACS_INT from = coding->dst_pos + charbuf[2];
5701 EMACS_INT to = coding->dst_pos + charbuf[3];
5702 struct charset *charset = CHARSET_FROM_ID (charbuf[4]);
5703
5704 Fput_text_property (make_number (from), make_number (to),
5705 Qcharset, CHARSET_NAME (charset),
5706 coding->dst_object);
5707 }
5708
5709
5710 #define CHARBUF_SIZE 0x4000
5711
5712 #define ALLOC_CONVERSION_WORK_AREA(coding) \
5713 do { \
5714 int size = CHARBUF_SIZE;; \
5715 \
5716 coding->charbuf = NULL; \
5717 while (size > 1024) \
5718 { \
5719 coding->charbuf = (int *) alloca (sizeof (int) * size); \
5720 if (coding->charbuf) \
5721 break; \
5722 size >>= 1; \
5723 } \
5724 if (! coding->charbuf) \
5725 { \
5726 coding->result = CODING_RESULT_INSUFFICIENT_MEM; \
5727 return coding->result; \
5728 } \
5729 coding->charbuf_size = size; \
5730 } while (0)
5731
5732
5733 static void
5734 produce_annotation (coding)
5735 struct coding_system *coding;
5736 {
5737 int *charbuf = coding->charbuf;
5738 int *charbuf_end = charbuf + coding->charbuf_used;
5739
5740 if (NILP (coding->dst_object))
5741 return;
5742
5743 while (charbuf < charbuf_end)
5744 {
5745 if (*charbuf >= 0)
5746 charbuf++;
5747 else
5748 {
5749 int len = -*charbuf;
5750 switch (charbuf[1])
5751 {
5752 case CODING_ANNOTATE_COMPOSITION_MASK:
5753 produce_composition (coding, charbuf);
5754 break;
5755 case CODING_ANNOTATE_CHARSET_MASK:
5756 produce_charset (coding, charbuf);
5757 break;
5758 default:
5759 abort ();
5760 }
5761 charbuf += len;
5762 }
5763 }
5764 }
5765
5766 /* Decode the data at CODING->src_object into CODING->dst_object.
5767 CODING->src_object is a buffer, a string, or nil.
5768 CODING->dst_object is a buffer.
5769
5770 If CODING->src_object is a buffer, it must be the current buffer.
5771 In this case, if CODING->src_pos is positive, it is a position of
5772 the source text in the buffer, otherwise, the source text is in the
5773 gap area of the buffer, and CODING->src_pos specifies the offset of
5774 the text from GPT (which must be the same as PT). If this is the
5775 same buffer as CODING->dst_object, CODING->src_pos must be
5776 negative.
5777
5778 If CODING->src_object is a string, CODING->src_pos in an index to
5779 that string.
5780
5781 If CODING->src_object is nil, CODING->source must already point to
5782 the non-relocatable memory area. In this case, CODING->src_pos is
5783 an offset from CODING->source.
5784
5785 The decoded data is inserted at the current point of the buffer
5786 CODING->dst_object.
5787 */
5788
5789 static int
5790 decode_coding (coding)
5791 struct coding_system *coding;
5792 {
5793 Lisp_Object attrs;
5794
5795 if (BUFFERP (coding->src_object)
5796 && coding->src_pos > 0
5797 && coding->src_pos < GPT
5798 && coding->src_pos + coding->src_chars > GPT)
5799 move_gap_both (coding->src_pos, coding->src_pos_byte);
5800
5801 if (BUFFERP (coding->dst_object))
5802 {
5803 if (current_buffer != XBUFFER (coding->dst_object))
5804 set_buffer_internal (XBUFFER (coding->dst_object));
5805 if (GPT != PT)
5806 move_gap_both (PT, PT_BYTE);
5807 }
5808
5809 coding->consumed = coding->consumed_char = 0;
5810 coding->produced = coding->produced_char = 0;
5811 coding->chars_at_source = 0;
5812 coding->result = CODING_RESULT_SUCCESS;
5813 coding->errors = 0;
5814
5815 ALLOC_CONVERSION_WORK_AREA (coding);
5816
5817 attrs = CODING_ID_ATTRS (coding->id);
5818
5819 do
5820 {
5821 coding_set_source (coding);
5822 coding->annotated = 0;
5823 (*(coding->decoder)) (coding);
5824 if (!NILP (CODING_ATTR_DECODE_TBL (attrs)))
5825 translate_chars (coding, CODING_ATTR_DECODE_TBL (attrs));
5826 else if (!NILP (Vstandard_translation_table_for_decode))
5827 translate_chars (coding, Vstandard_translation_table_for_decode);
5828 coding_set_destination (coding);
5829 produce_chars (coding);
5830 if (coding->annotated)
5831 produce_annotation (coding);
5832 }
5833 while (coding->consumed < coding->src_bytes
5834 && ! coding->result);
5835
5836 if (EQ (CODING_ATTR_TYPE (CODING_ID_ATTRS (coding->id)), Qccl)
5837 && SYMBOLP (CODING_ID_EOL_TYPE (coding->id))
5838 && ! EQ (CODING_ID_EOL_TYPE (coding->id), Qunix))
5839 decode_eol (coding);
5840
5841 coding->carryover_bytes = 0;
5842 if (coding->consumed < coding->src_bytes)
5843 {
5844 int nbytes = coding->src_bytes - coding->consumed;
5845 unsigned char *src;
5846
5847 coding_set_source (coding);
5848 coding_set_destination (coding);
5849 src = coding->source + coding->consumed;
5850
5851 if (coding->mode & CODING_MODE_LAST_BLOCK)
5852 {
5853 /* Flush out unprocessed data as binary chars. We are sure
5854 that the number of data is less than the size of
5855 coding->charbuf. */
5856 while (nbytes-- > 0)
5857 {
5858 int c = *src++;
5859
5860 coding->charbuf[coding->charbuf_used++] = (c & 0x80 ? - c : c);
5861 }
5862 produce_chars (coding);
5863 }
5864 else
5865 {
5866 /* Record unprocessed bytes in coding->carryover. We are
5867 sure that the number of data is less than the size of
5868 coding->carryover. */
5869 unsigned char *p = coding->carryover;
5870
5871 coding->carryover_bytes = nbytes;
5872 while (nbytes-- > 0)
5873 *p++ = *src++;
5874 }
5875 coding->consumed = coding->src_bytes;
5876 }
5877
5878 return coding->result;
5879 }
5880
5881
5882 /* Extract an annotation datum from a composition starting at POS and
5883 ending before LIMIT of CODING->src_object (buffer or string), store
5884 the data in BUF, set *STOP to a starting position of the next
5885 composition (if any) or to LIMIT, and return the address of the
5886 next element of BUF.
5887
5888 If such an annotation is not found, set *STOP to a starting
5889 position of a composition after POS (if any) or to LIMIT, and
5890 return BUF. */
5891
5892 static INLINE int *
5893 handle_composition_annotation (pos, limit, coding, buf, stop)
5894 EMACS_INT pos, limit;
5895 struct coding_system *coding;
5896 int *buf;
5897 EMACS_INT *stop;
5898 {
5899 EMACS_INT start, end;
5900 Lisp_Object prop;
5901
5902 if (! find_composition (pos, limit, &start, &end, &prop, coding->src_object)
5903 || end > limit)
5904 *stop = limit;
5905 else if (start > pos)
5906 *stop = start;
5907 else
5908 {
5909 if (start == pos)
5910 {
5911 /* We found a composition. Store the corresponding
5912 annotation data in BUF. */
5913 int *head = buf;
5914 enum composition_method method = COMPOSITION_METHOD (prop);
5915 int nchars = COMPOSITION_LENGTH (prop);
5916
5917 ADD_COMPOSITION_DATA (buf, 0, nchars, method);
5918 if (method != COMPOSITION_RELATIVE)
5919 {
5920 Lisp_Object components;
5921 int len, i, i_byte;
5922
5923 components = COMPOSITION_COMPONENTS (prop);
5924 if (VECTORP (components))
5925 {
5926 len = XVECTOR (components)->size;
5927 for (i = 0; i < len; i++)
5928 *buf++ = XINT (AREF (components, i));
5929 }
5930 else if (STRINGP (components))
5931 {
5932 len = XSTRING (components)->size;
5933 i = i_byte = 0;
5934 while (i < len)
5935 {
5936 FETCH_STRING_CHAR_ADVANCE (*buf, components, i, i_byte);
5937 buf++;
5938 }
5939 }
5940 else if (INTEGERP (components))
5941 {
5942 len = 1;
5943 *buf++ = XINT (components);
5944 }
5945 else if (CONSP (components))
5946 {
5947 for (len = 0; CONSP (components);
5948 len++, components = XCDR (components))
5949 *buf++ = XINT (XCAR (components));
5950 }
5951 else
5952 abort ();
5953 *head -= len;
5954 }
5955 }
5956
5957 if (find_composition (end, limit, &start, &end, &prop,
5958 coding->src_object)
5959 && end <= limit)
5960 *stop = start;
5961 else
5962 *stop = limit;
5963 }
5964 return buf;
5965 }
5966
5967
5968 /* Extract an annotation datum from a text property `charset' at POS of
5969 CODING->src_object (buffer of string), store the data in BUF, set
5970 *STOP to the position where the value of `charset' property changes
5971 (limiting by LIMIT), and return the address of the next element of
5972 BUF.
5973
5974 If the property value is nil, set *STOP to the position where the
5975 property value is non-nil (limiting by LIMIT), and return BUF. */
5976
5977 static INLINE int *
5978 handle_charset_annotation (pos, limit, coding, buf, stop)
5979 EMACS_INT pos, limit;
5980 struct coding_system *coding;
5981 int *buf;
5982 EMACS_INT *stop;
5983 {
5984 Lisp_Object val, next;
5985 int id;
5986
5987 val = Fget_text_property (make_number (pos), Qcharset, coding->src_object);
5988 if (! NILP (val) && CHARSETP (val))
5989 id = XINT (CHARSET_SYMBOL_ID (val));
5990 else
5991 id = -1;
5992 ADD_CHARSET_DATA (buf, 0, 0, id);
5993 next = Fnext_single_property_change (make_number (pos), Qcharset,
5994 coding->src_object,
5995 make_number (limit));
5996 *stop = XINT (next);
5997 return buf;
5998 }
5999
6000
6001 static void
6002 consume_chars (coding)
6003 struct coding_system *coding;
6004 {
6005 int *buf = coding->charbuf;
6006 int *buf_end = coding->charbuf + coding->charbuf_size;
6007 const unsigned char *src = coding->source + coding->consumed;
6008 const unsigned char *src_end = coding->source + coding->src_bytes;
6009 EMACS_INT pos = coding->src_pos + coding->consumed_char;
6010 EMACS_INT end_pos = coding->src_pos + coding->src_chars;
6011 int multibytep = coding->src_multibyte;
6012 Lisp_Object eol_type;
6013 int c;
6014 EMACS_INT stop, stop_composition, stop_charset;
6015
6016 eol_type = CODING_ID_EOL_TYPE (coding->id);
6017 if (VECTORP (eol_type))
6018 eol_type = Qunix;
6019
6020 /* Note: composition handling is not yet implemented. */
6021 coding->common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
6022
6023 if (coding->common_flags & CODING_ANNOTATE_COMPOSITION_MASK)
6024 stop = stop_composition = pos;
6025 else
6026 stop = stop_composition = end_pos;
6027 if (coding->common_flags & CODING_ANNOTATE_CHARSET_MASK)
6028 stop = stop_charset = pos;
6029 else
6030 stop_charset = end_pos;
6031
6032 /* Compensate for CRLF and annotation. */
6033 buf_end -= 1 + MAX_ANNOTATION_LENGTH;
6034 while (buf < buf_end)
6035 {
6036 if (pos == stop)
6037 {
6038 if (pos == end_pos)
6039 break;
6040 if (pos == stop_composition)
6041 buf = handle_composition_annotation (pos, end_pos, coding,
6042 buf, &stop_composition);
6043 if (pos == stop_charset)
6044 buf = handle_charset_annotation (pos, end_pos, coding,
6045 buf, &stop_charset);
6046 stop = (stop_composition < stop_charset
6047 ? stop_composition : stop_charset);
6048 }
6049
6050 if (! multibytep)
6051 {
6052 EMACS_INT bytes;
6053
6054 if (! CODING_FOR_UNIBYTE (coding)
6055 && (bytes = MULTIBYTE_LENGTH (src, src_end)) > 0)
6056 c = STRING_CHAR_ADVANCE (src), pos += bytes;
6057 else
6058 c = *src++, pos++;
6059 }
6060 else
6061 c = STRING_CHAR_ADVANCE (src), pos++;
6062 if ((c == '\r') && (coding->mode & CODING_MODE_SELECTIVE_DISPLAY))
6063 c = '\n';
6064 if (! EQ (eol_type, Qunix))
6065 {
6066 if (c == '\n')
6067 {
6068 if (EQ (eol_type, Qdos))
6069 *buf++ = '\r';
6070 else
6071 c = '\r';
6072 }
6073 }
6074 *buf++ = c;
6075 }
6076
6077 coding->consumed = src - coding->source;
6078 coding->consumed_char = pos - coding->src_pos;
6079 coding->charbuf_used = buf - coding->charbuf;
6080 coding->chars_at_source = 0;
6081 }
6082
6083
6084 /* Encode the text at CODING->src_object into CODING->dst_object.
6085 CODING->src_object is a buffer or a string.
6086 CODING->dst_object is a buffer or nil.
6087
6088 If CODING->src_object is a buffer, it must be the current buffer.
6089 In this case, if CODING->src_pos is positive, it is a position of
6090 the source text in the buffer, otherwise. the source text is in the
6091 gap area of the buffer, and coding->src_pos specifies the offset of
6092 the text from GPT (which must be the same as PT). If this is the
6093 same buffer as CODING->dst_object, CODING->src_pos must be
6094 negative and CODING should not have `pre-write-conversion'.
6095
6096 If CODING->src_object is a string, CODING should not have
6097 `pre-write-conversion'.
6098
6099 If CODING->dst_object is a buffer, the encoded data is inserted at
6100 the current point of that buffer.
6101
6102 If CODING->dst_object is nil, the encoded data is placed at the
6103 memory area specified by CODING->destination. */
6104
6105 static int
6106 encode_coding (coding)
6107 struct coding_system *coding;
6108 {
6109 Lisp_Object attrs;
6110
6111 attrs = CODING_ID_ATTRS (coding->id);
6112
6113 if (BUFFERP (coding->dst_object))
6114 {
6115 set_buffer_internal (XBUFFER (coding->dst_object));
6116 coding->dst_multibyte
6117 = ! NILP (current_buffer->enable_multibyte_characters);
6118 }
6119
6120 coding->consumed = coding->consumed_char = 0;
6121 coding->produced = coding->produced_char = 0;
6122 coding->result = CODING_RESULT_SUCCESS;
6123 coding->errors = 0;
6124
6125 ALLOC_CONVERSION_WORK_AREA (coding);
6126
6127 do {
6128 coding_set_source (coding);
6129 consume_chars (coding);
6130
6131 if (!NILP (CODING_ATTR_ENCODE_TBL (attrs)))
6132 translate_chars (coding, CODING_ATTR_ENCODE_TBL (attrs));
6133 else if (!NILP (Vstandard_translation_table_for_encode))
6134 translate_chars (coding, Vstandard_translation_table_for_encode);
6135
6136 coding_set_destination (coding);
6137 (*(coding->encoder)) (coding);
6138 } while (coding->consumed_char < coding->src_chars);
6139
6140 if (BUFFERP (coding->dst_object))
6141 insert_from_gap (coding->produced_char, coding->produced);
6142
6143 return (coding->result);
6144 }
6145
6146
6147 /* Stack of working buffers used in code conversion. An nil element
6148 means that the code conversion of that level is not using a working
6149 buffer. */
6150 Lisp_Object Vcode_conversion_work_buf_list;
6151
6152 /* A working buffer used by the top level conversion. */
6153 Lisp_Object Vcode_conversion_reused_work_buf;
6154
6155
6156 /* Return a working buffer that can be freely used by the following
6157 code conversion. MULTIBYTEP specifies the multibyteness of the
6158 buffer. */
6159
6160 Lisp_Object
6161 make_conversion_work_buffer (multibytep, depth)
6162 int multibytep, depth;
6163 {
6164 struct buffer *current = current_buffer;
6165 Lisp_Object buf, name;
6166
6167 if (depth == 0)
6168 {
6169 if (NILP (Vcode_conversion_reused_work_buf))
6170 Vcode_conversion_reused_work_buf
6171 = Fget_buffer_create (build_string (" *code-converting-work<0>*"));
6172 buf = Vcode_conversion_reused_work_buf;
6173 }
6174 else
6175 {
6176 if (depth < 0)
6177 {
6178 name = build_string (" *code-converting-work*");
6179 name = Fgenerate_new_buffer_name (name, Qnil);
6180 }
6181 else
6182 {
6183 char str[128];
6184
6185 sprintf (str, " *code-converting-work*<%d>", depth);
6186 name = build_string (str);
6187 }
6188 buf = Fget_buffer_create (name);
6189 }
6190 set_buffer_internal (XBUFFER (buf));
6191 current_buffer->undo_list = Qt;
6192 Ferase_buffer ();
6193 Fset_buffer_multibyte (multibytep ? Qt : Qnil, Qnil);
6194 set_buffer_internal (current);
6195 return buf;
6196 }
6197
6198 static Lisp_Object
6199 code_conversion_restore (buffer)
6200 Lisp_Object buffer;
6201 {
6202 Lisp_Object workbuf;
6203
6204 workbuf = XCAR (Vcode_conversion_work_buf_list);
6205 if (! NILP (workbuf)
6206 && ! EQ (workbuf, Vcode_conversion_reused_work_buf)
6207 && ! NILP (Fbuffer_live_p (workbuf)))
6208 Fkill_buffer (workbuf);
6209 Vcode_conversion_work_buf_list = XCDR (Vcode_conversion_work_buf_list);
6210 set_buffer_internal (XBUFFER (buffer));
6211 return Qnil;
6212 }
6213
6214 static Lisp_Object
6215 code_conversion_save (buffer, with_work_buf, multibyte)
6216 Lisp_Object buffer;
6217 int with_work_buf, multibyte;
6218 {
6219 Lisp_Object workbuf;
6220
6221 if (with_work_buf)
6222 {
6223 int depth = XINT (Flength (Vcode_conversion_work_buf_list));
6224
6225 workbuf = make_conversion_work_buffer (multibyte, depth);
6226 }
6227 else
6228 workbuf = Qnil;
6229 Vcode_conversion_work_buf_list
6230 = Fcons (workbuf, Vcode_conversion_work_buf_list);
6231 record_unwind_protect (code_conversion_restore, buffer);
6232 return workbuf;
6233 }
6234
6235 int
6236 decode_coding_gap (coding, chars, bytes)
6237 struct coding_system *coding;
6238 EMACS_INT chars, bytes;
6239 {
6240 int count = specpdl_ptr - specpdl;
6241 Lisp_Object attrs;
6242 Lisp_Object buffer;
6243
6244 buffer = Fcurrent_buffer ();
6245 code_conversion_save (buffer, 0, 0);
6246
6247 coding->src_object = buffer;
6248 coding->src_chars = chars;
6249 coding->src_bytes = bytes;
6250 coding->src_pos = -chars;
6251 coding->src_pos_byte = -bytes;
6252 coding->src_multibyte = chars < bytes;
6253 coding->dst_object = buffer;
6254 coding->dst_pos = PT;
6255 coding->dst_pos_byte = PT_BYTE;
6256 coding->dst_multibyte = ! NILP (current_buffer->enable_multibyte_characters);
6257 coding->mode |= CODING_MODE_LAST_BLOCK;
6258
6259 if (CODING_REQUIRE_DETECTION (coding))
6260 detect_coding (coding);
6261
6262 decode_coding (coding);
6263
6264 attrs = CODING_ID_ATTRS (coding->id);
6265 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6266 {
6267 struct gcpro gcpro1;
6268 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6269 Lisp_Object val;
6270
6271 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6272 GCPRO1 (buffer);
6273 val = call1 (CODING_ATTR_POST_READ (attrs),
6274 make_number (coding->produced_char));
6275 UNGCPRO;
6276 CHECK_NATNUM (val);
6277 coding->produced_char += Z - prev_Z;
6278 coding->produced += Z_BYTE - prev_Z_BYTE;
6279 }
6280
6281 unbind_to (count, Qnil);
6282 return coding->result;
6283 }
6284
6285 int
6286 encode_coding_gap (coding, chars, bytes)
6287 struct coding_system *coding;
6288 EMACS_INT chars, bytes;
6289 {
6290 int count = specpdl_ptr - specpdl;
6291 Lisp_Object buffer;
6292
6293 buffer = Fcurrent_buffer ();
6294 code_conversion_save (buffer, 0, 0);
6295
6296 coding->src_object = buffer;
6297 coding->src_chars = chars;
6298 coding->src_bytes = bytes;
6299 coding->src_pos = -chars;
6300 coding->src_pos_byte = -bytes;
6301 coding->src_multibyte = chars < bytes;
6302 coding->dst_object = coding->src_object;
6303 coding->dst_pos = PT;
6304 coding->dst_pos_byte = PT_BYTE;
6305
6306 encode_coding (coding);
6307
6308 unbind_to (count, Qnil);
6309 return coding->result;
6310 }
6311
6312
6313 /* Decode the text in the range FROM/FROM_BYTE and TO/TO_BYTE in
6314 SRC_OBJECT into DST_OBJECT by coding context CODING.
6315
6316 SRC_OBJECT is a buffer, a string, or Qnil.
6317
6318 If it is a buffer, the text is at point of the buffer. FROM and TO
6319 are positions in the buffer.
6320
6321 If it is a string, the text is at the beginning of the string.
6322 FROM and TO are indices to the string.
6323
6324 If it is nil, the text is at coding->source. FROM and TO are
6325 indices to coding->source.
6326
6327 DST_OBJECT is a buffer, Qt, or Qnil.
6328
6329 If it is a buffer, the decoded text is inserted at point of the
6330 buffer. If the buffer is the same as SRC_OBJECT, the source text
6331 is deleted.
6332
6333 If it is Qt, a string is made from the decoded text, and
6334 set in CODING->dst_object.
6335
6336 If it is Qnil, the decoded text is stored at CODING->destination.
6337 The caller must allocate CODING->dst_bytes bytes at
6338 CODING->destination by xmalloc. If the decoded text is longer than
6339 CODING->dst_bytes, CODING->destination is relocated by xrealloc.
6340 */
6341
6342 void
6343 decode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6344 dst_object)
6345 struct coding_system *coding;
6346 Lisp_Object src_object;
6347 EMACS_INT from, from_byte, to, to_byte;
6348 Lisp_Object dst_object;
6349 {
6350 int count = specpdl_ptr - specpdl;
6351 unsigned char *destination;
6352 EMACS_INT dst_bytes;
6353 EMACS_INT chars = to - from;
6354 EMACS_INT bytes = to_byte - from_byte;
6355 Lisp_Object attrs;
6356 Lisp_Object buffer;
6357 int saved_pt = -1, saved_pt_byte;
6358
6359 buffer = Fcurrent_buffer ();
6360
6361 if (NILP (dst_object))
6362 {
6363 destination = coding->destination;
6364 dst_bytes = coding->dst_bytes;
6365 }
6366
6367 coding->src_object = src_object;
6368 coding->src_chars = chars;
6369 coding->src_bytes = bytes;
6370 coding->src_multibyte = chars < bytes;
6371
6372 if (STRINGP (src_object))
6373 {
6374 coding->src_pos = from;
6375 coding->src_pos_byte = from_byte;
6376 }
6377 else if (BUFFERP (src_object))
6378 {
6379 set_buffer_internal (XBUFFER (src_object));
6380 if (from != GPT)
6381 move_gap_both (from, from_byte);
6382 if (EQ (src_object, dst_object))
6383 {
6384 saved_pt = PT, saved_pt_byte = PT_BYTE;
6385 TEMP_SET_PT_BOTH (from, from_byte);
6386 del_range_both (from, from_byte, to, to_byte, 1);
6387 coding->src_pos = -chars;
6388 coding->src_pos_byte = -bytes;
6389 }
6390 else
6391 {
6392 coding->src_pos = from;
6393 coding->src_pos_byte = from_byte;
6394 }
6395 }
6396
6397 if (CODING_REQUIRE_DETECTION (coding))
6398 detect_coding (coding);
6399 attrs = CODING_ID_ATTRS (coding->id);
6400
6401 if (EQ (dst_object, Qt)
6402 || (! NILP (CODING_ATTR_POST_READ (attrs))
6403 && NILP (dst_object)))
6404 {
6405 coding->dst_object = code_conversion_save (buffer, 1, 1);
6406 coding->dst_pos = BEG;
6407 coding->dst_pos_byte = BEG_BYTE;
6408 coding->dst_multibyte = 1;
6409 }
6410 else if (BUFFERP (dst_object))
6411 {
6412 code_conversion_save (buffer, 0, 0);
6413 coding->dst_object = dst_object;
6414 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6415 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6416 coding->dst_multibyte
6417 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6418 }
6419 else
6420 {
6421 code_conversion_save (buffer, 0, 0);
6422 coding->dst_object = Qnil;
6423 coding->dst_multibyte = 1;
6424 }
6425
6426 decode_coding (coding);
6427
6428 if (BUFFERP (coding->dst_object))
6429 set_buffer_internal (XBUFFER (coding->dst_object));
6430
6431 if (! NILP (CODING_ATTR_POST_READ (attrs)))
6432 {
6433 struct gcpro gcpro1, gcpro2;
6434 EMACS_INT prev_Z = Z, prev_Z_BYTE = Z_BYTE;
6435 Lisp_Object val;
6436
6437 TEMP_SET_PT_BOTH (coding->dst_pos, coding->dst_pos_byte);
6438 GCPRO2 (coding->src_object, coding->dst_object);
6439 val = call1 (CODING_ATTR_POST_READ (attrs),
6440 make_number (coding->produced_char));
6441 UNGCPRO;
6442 CHECK_NATNUM (val);
6443 coding->produced_char += Z - prev_Z;
6444 coding->produced += Z_BYTE - prev_Z_BYTE;
6445 }
6446
6447 if (EQ (dst_object, Qt))
6448 {
6449 coding->dst_object = Fbuffer_string ();
6450 }
6451 else if (NILP (dst_object) && BUFFERP (coding->dst_object))
6452 {
6453 set_buffer_internal (XBUFFER (coding->dst_object));
6454 if (dst_bytes < coding->produced)
6455 {
6456 destination
6457 = (unsigned char *) xrealloc (destination, coding->produced);
6458 if (! destination)
6459 {
6460 coding->result = CODING_RESULT_INSUFFICIENT_DST;
6461 unbind_to (count, Qnil);
6462 return;
6463 }
6464 if (BEGV < GPT && GPT < BEGV + coding->produced_char)
6465 move_gap_both (BEGV, BEGV_BYTE);
6466 bcopy (BEGV_ADDR, destination, coding->produced);
6467 coding->destination = destination;
6468 }
6469 }
6470
6471 if (saved_pt >= 0)
6472 {
6473 /* This is the case of:
6474 (BUFFERP (src_object) && EQ (src_object, dst_object))
6475 As we have moved PT while replacing the original buffer
6476 contents, we must recover it now. */
6477 set_buffer_internal (XBUFFER (src_object));
6478 if (saved_pt < from)
6479 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6480 else if (saved_pt < from + chars)
6481 TEMP_SET_PT_BOTH (from, from_byte);
6482 else if (! NILP (current_buffer->enable_multibyte_characters))
6483 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6484 saved_pt_byte + (coding->produced - bytes));
6485 else
6486 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6487 saved_pt_byte + (coding->produced - bytes));
6488 }
6489
6490 unbind_to (count, Qnil);
6491 }
6492
6493
6494 void
6495 encode_coding_object (coding, src_object, from, from_byte, to, to_byte,
6496 dst_object)
6497 struct coding_system *coding;
6498 Lisp_Object src_object;
6499 EMACS_INT from, from_byte, to, to_byte;
6500 Lisp_Object dst_object;
6501 {
6502 int count = specpdl_ptr - specpdl;
6503 EMACS_INT chars = to - from;
6504 EMACS_INT bytes = to_byte - from_byte;
6505 Lisp_Object attrs;
6506 Lisp_Object buffer;
6507 int saved_pt = -1, saved_pt_byte;
6508
6509 buffer = Fcurrent_buffer ();
6510
6511 coding->src_object = src_object;
6512 coding->src_chars = chars;
6513 coding->src_bytes = bytes;
6514 coding->src_multibyte = chars < bytes;
6515
6516 attrs = CODING_ID_ATTRS (coding->id);
6517
6518 if (! NILP (CODING_ATTR_PRE_WRITE (attrs)))
6519 {
6520 coding->src_object = code_conversion_save (buffer, 1,
6521 coding->src_multibyte);
6522 set_buffer_internal (XBUFFER (coding->src_object));
6523 if (STRINGP (src_object))
6524 insert_from_string (src_object, from, from_byte, chars, bytes, 0);
6525 else if (BUFFERP (src_object))
6526 insert_from_buffer (XBUFFER (src_object), from, chars, 0);
6527 else
6528 insert_1_both (coding->source + from, chars, bytes, 0, 0, 0);
6529
6530 if (EQ (src_object, dst_object))
6531 {
6532 set_buffer_internal (XBUFFER (src_object));
6533 saved_pt = PT, saved_pt_byte = PT_BYTE;
6534 del_range_both (from, from_byte, to, to_byte, 1);
6535 set_buffer_internal (XBUFFER (coding->src_object));
6536 }
6537
6538 call2 (CODING_ATTR_PRE_WRITE (attrs),
6539 make_number (BEG), make_number (Z));
6540 coding->src_object = Fcurrent_buffer ();
6541 if (BEG != GPT)
6542 move_gap_both (BEG, BEG_BYTE);
6543 coding->src_chars = Z - BEG;
6544 coding->src_bytes = Z_BYTE - BEG_BYTE;
6545 coding->src_pos = BEG;
6546 coding->src_pos_byte = BEG_BYTE;
6547 coding->src_multibyte = Z < Z_BYTE;
6548 }
6549 else if (STRINGP (src_object))
6550 {
6551 code_conversion_save (buffer, 0, 0);
6552 coding->src_pos = from;
6553 coding->src_pos_byte = from_byte;
6554 }
6555 else if (BUFFERP (src_object))
6556 {
6557 code_conversion_save (buffer, 0, 0);
6558 set_buffer_internal (XBUFFER (src_object));
6559 if (EQ (src_object, dst_object))
6560 {
6561 saved_pt = PT, saved_pt_byte = PT_BYTE;
6562 coding->src_object = del_range_1 (from, to, 1, 1);
6563 coding->src_pos = 0;
6564 coding->src_pos_byte = 0;
6565 }
6566 else
6567 {
6568 if (from < GPT && to >= GPT)
6569 move_gap_both (from, from_byte);
6570 coding->src_pos = from;
6571 coding->src_pos_byte = from_byte;
6572 }
6573 }
6574 else
6575 code_conversion_save (buffer, 0, 0);
6576
6577 if (BUFFERP (dst_object))
6578 {
6579 coding->dst_object = dst_object;
6580 if (EQ (src_object, dst_object))
6581 {
6582 coding->dst_pos = from;
6583 coding->dst_pos_byte = from_byte;
6584 }
6585 else
6586 {
6587 coding->dst_pos = BUF_PT (XBUFFER (dst_object));
6588 coding->dst_pos_byte = BUF_PT_BYTE (XBUFFER (dst_object));
6589 }
6590 coding->dst_multibyte
6591 = ! NILP (XBUFFER (dst_object)->enable_multibyte_characters);
6592 }
6593 else if (EQ (dst_object, Qt))
6594 {
6595 coding->dst_object = Qnil;
6596 coding->dst_bytes = coding->src_chars;
6597 if (coding->dst_bytes == 0)
6598 coding->dst_bytes = 1;
6599 coding->destination = (unsigned char *) xmalloc (coding->dst_bytes);
6600 coding->dst_multibyte = 0;
6601 }
6602 else
6603 {
6604 coding->dst_object = Qnil;
6605 coding->dst_multibyte = 0;
6606 }
6607
6608 encode_coding (coding);
6609
6610 if (EQ (dst_object, Qt))
6611 {
6612 if (BUFFERP (coding->dst_object))
6613 coding->dst_object = Fbuffer_string ();
6614 else
6615 {
6616 coding->dst_object
6617 = make_unibyte_string ((char *) coding->destination,
6618 coding->produced);
6619 xfree (coding->destination);
6620 }
6621 }
6622
6623 if (saved_pt >= 0)
6624 {
6625 /* This is the case of:
6626 (BUFFERP (src_object) && EQ (src_object, dst_object))
6627 As we have moved PT while replacing the original buffer
6628 contents, we must recover it now. */
6629 set_buffer_internal (XBUFFER (src_object));
6630 if (saved_pt < from)
6631 TEMP_SET_PT_BOTH (saved_pt, saved_pt_byte);
6632 else if (saved_pt < from + chars)
6633 TEMP_SET_PT_BOTH (from, from_byte);
6634 else if (! NILP (current_buffer->enable_multibyte_characters))
6635 TEMP_SET_PT_BOTH (saved_pt + (coding->produced_char - chars),
6636 saved_pt_byte + (coding->produced - bytes));
6637 else
6638 TEMP_SET_PT_BOTH (saved_pt + (coding->produced - bytes),
6639 saved_pt_byte + (coding->produced - bytes));
6640 }
6641
6642 unbind_to (count, Qnil);
6643 }
6644
6645
6646 Lisp_Object
6647 preferred_coding_system ()
6648 {
6649 int id = coding_categories[coding_priorities[0]].id;
6650
6651 return CODING_ID_NAME (id);
6652 }
6653
6654 \f
6655 #ifdef emacs
6656 /*** 8. Emacs Lisp library functions ***/
6657
6658 DEFUN ("coding-system-p", Fcoding_system_p, Scoding_system_p, 1, 1, 0,
6659 doc: /* Return t if OBJECT is nil or a coding-system.
6660 See the documentation of `define-coding-system' for information
6661 about coding-system objects. */)
6662 (obj)
6663 Lisp_Object obj;
6664 {
6665 return ((NILP (obj) || CODING_SYSTEM_P (obj)) ? Qt : Qnil);
6666 }
6667
6668 DEFUN ("read-non-nil-coding-system", Fread_non_nil_coding_system,
6669 Sread_non_nil_coding_system, 1, 1, 0,
6670 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT. */)
6671 (prompt)
6672 Lisp_Object prompt;
6673 {
6674 Lisp_Object val;
6675 do
6676 {
6677 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6678 Qt, Qnil, Qcoding_system_history, Qnil, Qnil);
6679 }
6680 while (XSTRING (val)->size == 0);
6681 return (Fintern (val, Qnil));
6682 }
6683
6684 DEFUN ("read-coding-system", Fread_coding_system, Sread_coding_system, 1, 2, 0,
6685 doc: /* Read a coding system from the minibuffer, prompting with string PROMPT.
6686 If the user enters null input, return second argument DEFAULT-CODING-SYSTEM. */)
6687 (prompt, default_coding_system)
6688 Lisp_Object prompt, default_coding_system;
6689 {
6690 Lisp_Object val;
6691 if (SYMBOLP (default_coding_system))
6692 XSETSTRING (default_coding_system, XSYMBOL (default_coding_system)->name);
6693 val = Fcompleting_read (prompt, Vcoding_system_alist, Qnil,
6694 Qt, Qnil, Qcoding_system_history,
6695 default_coding_system, Qnil);
6696 return (XSTRING (val)->size == 0 ? Qnil : Fintern (val, Qnil));
6697 }
6698
6699 DEFUN ("check-coding-system", Fcheck_coding_system, Scheck_coding_system,
6700 1, 1, 0,
6701 doc: /* Check validity of CODING-SYSTEM.
6702 If valid, return CODING-SYSTEM, else signal a `coding-system-error' error. */)
6703 (coding_system)
6704 Lisp_Object coding_system;
6705 {
6706 CHECK_SYMBOL (coding_system);
6707 if (!NILP (Fcoding_system_p (coding_system)))
6708 return coding_system;
6709 while (1)
6710 Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil));
6711 }
6712
6713 \f
6714 /* Detect how the bytes at SRC of length SRC_BYTES are encoded. If
6715 HIGHEST is nonzero, return the coding system of the highest
6716 priority among the detected coding systems. Otherwize return a
6717 list of detected coding systems sorted by their priorities. If
6718 MULTIBYTEP is nonzero, it is assumed that the bytes are in correct
6719 multibyte form but contains only ASCII and eight-bit chars.
6720 Otherwise, the bytes are raw bytes.
6721
6722 CODING-SYSTEM controls the detection as below:
6723
6724 If it is nil, detect both text-format and eol-format. If the
6725 text-format part of CODING-SYSTEM is already specified
6726 (e.g. `iso-latin-1'), detect only eol-format. If the eol-format
6727 part of CODING-SYSTEM is already specified (e.g. `undecided-unix'),
6728 detect only text-format. */
6729
6730 Lisp_Object
6731 detect_coding_system (src, src_bytes, highest, multibytep, coding_system)
6732 unsigned char *src;
6733 int src_bytes, highest;
6734 int multibytep;
6735 Lisp_Object coding_system;
6736 {
6737 unsigned char *src_end = src + src_bytes;
6738 Lisp_Object attrs, eol_type;
6739 Lisp_Object val;
6740 struct coding_system coding;
6741 int id;
6742 struct coding_detection_info detect_info;
6743
6744 if (NILP (coding_system))
6745 coding_system = Qundecided;
6746 setup_coding_system (coding_system, &coding);
6747 attrs = CODING_ID_ATTRS (coding.id);
6748 eol_type = CODING_ID_EOL_TYPE (coding.id);
6749 coding_system = CODING_ATTR_BASE_NAME (attrs);
6750
6751 coding.source = src;
6752 coding.src_bytes = src_bytes;
6753 coding.src_multibyte = multibytep;
6754 coding.consumed = 0;
6755 coding.mode |= CODING_MODE_LAST_BLOCK;
6756
6757 detect_info.checked = detect_info.found = detect_info.rejected = 0;
6758
6759 /* At first, detect text-format if necessary. */
6760 if (XINT (CODING_ATTR_CATEGORY (attrs)) == coding_category_undecided)
6761 {
6762 enum coding_category category;
6763 struct coding_system *this;
6764 int c, i;
6765
6766 for (; src < src_end; src++)
6767 {
6768 c = *src;
6769 if (c & 0x80
6770 || (c < 0x20 && (c == ISO_CODE_ESC
6771 || c == ISO_CODE_SI
6772 || c == ISO_CODE_SO)))
6773 break;
6774 }
6775 coding.head_ascii = src - coding.source;
6776
6777 if (src < src_end)
6778 for (i = 0; i < coding_category_raw_text; i++)
6779 {
6780 category = coding_priorities[i];
6781 this = coding_categories + category;
6782
6783 if (this->id < 0)
6784 {
6785 /* No coding system of this category is defined. */
6786 detect_info.rejected |= (1 << category);
6787 }
6788 else if (category >= coding_category_raw_text)
6789 continue;
6790 else if (detect_info.checked & (1 << category))
6791 {
6792 if (highest
6793 && (detect_info.found & (1 << category)))
6794 break;
6795 }
6796 else
6797 {
6798 if ((*(this->detector)) (&coding, &detect_info)
6799 && highest
6800 && (detect_info.found & (1 << category)))
6801 break;
6802 }
6803 }
6804
6805
6806 if (detect_info.rejected == CATEGORY_MASK_ANY)
6807 {
6808 detect_info.found = CATEGORY_MASK_RAW_TEXT;
6809 id = coding_categories[coding_category_raw_text].id;
6810 val = Fcons (make_number (id), Qnil);
6811 }
6812 else if (! detect_info.rejected && ! detect_info.found)
6813 {
6814 detect_info.found = CATEGORY_MASK_ANY;
6815 id = coding_categories[coding_category_undecided].id;
6816 val = Fcons (make_number (id), Qnil);
6817 }
6818 else if (highest)
6819 {
6820 if (detect_info.found)
6821 {
6822 detect_info.found = 1 << category;
6823 val = Fcons (make_number (this->id), Qnil);
6824 }
6825 else
6826 for (i = 0; i < coding_category_raw_text; i++)
6827 if (! (detect_info.rejected & (1 << coding_priorities[i])))
6828 {
6829 detect_info.found = 1 << coding_priorities[i];
6830 id = coding_categories[coding_priorities[i]].id;
6831 val = Fcons (make_number (id), Qnil);
6832 break;
6833 }
6834 }
6835 else
6836 {
6837 int mask = detect_info.rejected | detect_info.found;
6838 int found = 0;
6839 val = Qnil;
6840
6841 for (i = coding_category_raw_text - 1; i >= 0; i--)
6842 {
6843 category = coding_priorities[i];
6844 if (! (mask & (1 << category)))
6845 {
6846 found |= 1 << category;
6847 id = coding_categories[category].id;
6848 val = Fcons (make_number (id), val);
6849 }
6850 }
6851 for (i = coding_category_raw_text - 1; i >= 0; i--)
6852 {
6853 category = coding_priorities[i];
6854 if (detect_info.found & (1 << category))
6855 {
6856 id = coding_categories[category].id;
6857 val = Fcons (make_number (id), val);
6858 }
6859 }
6860 detect_info.found |= found;
6861 }
6862 }
6863 else
6864 {
6865 detect_info.found = 1 << XINT (CODING_ATTR_CATEGORY (attrs));
6866 val = Fcons (make_number (coding.id), Qnil);
6867 }
6868
6869 /* Then, detect eol-format if necessary. */
6870 {
6871 int normal_eol = -1, utf_16_be_eol = -1, utf_16_le_eol;
6872 Lisp_Object tail;
6873
6874 if (VECTORP (eol_type))
6875 {
6876 if (detect_info.found & ~CATEGORY_MASK_UTF_16)
6877 normal_eol = detect_eol (coding.source, src_bytes,
6878 coding_category_raw_text);
6879 if (detect_info.found & (CATEGORY_MASK_UTF_16_BE
6880 | CATEGORY_MASK_UTF_16_BE_NOSIG))
6881 utf_16_be_eol = detect_eol (coding.source, src_bytes,
6882 coding_category_utf_16_be);
6883 if (detect_info.found & (CATEGORY_MASK_UTF_16_LE
6884 | CATEGORY_MASK_UTF_16_LE_NOSIG))
6885 utf_16_le_eol = detect_eol (coding.source, src_bytes,
6886 coding_category_utf_16_le);
6887 }
6888 else
6889 {
6890 if (EQ (eol_type, Qunix))
6891 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_LF;
6892 else if (EQ (eol_type, Qdos))
6893 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CRLF;
6894 else
6895 normal_eol = utf_16_be_eol = utf_16_le_eol = EOL_SEEN_CR;
6896 }
6897
6898 for (tail = val; CONSP (tail); tail = XCDR (tail))
6899 {
6900 enum coding_category category;
6901 int this_eol;
6902
6903 id = XINT (XCAR (tail));
6904 attrs = CODING_ID_ATTRS (id);
6905 category = XINT (CODING_ATTR_CATEGORY (attrs));
6906 eol_type = CODING_ID_EOL_TYPE (id);
6907 if (VECTORP (eol_type))
6908 {
6909 if (category == coding_category_utf_16_be
6910 || category == coding_category_utf_16_be_nosig)
6911 this_eol = utf_16_be_eol;
6912 else if (category == coding_category_utf_16_le
6913 || category == coding_category_utf_16_le_nosig)
6914 this_eol = utf_16_le_eol;
6915 else
6916 this_eol = normal_eol;
6917
6918 if (this_eol == EOL_SEEN_LF)
6919 XSETCAR (tail, AREF (eol_type, 0));
6920 else if (this_eol == EOL_SEEN_CRLF)
6921 XSETCAR (tail, AREF (eol_type, 1));
6922 else if (this_eol == EOL_SEEN_CR)
6923 XSETCAR (tail, AREF (eol_type, 2));
6924 else
6925 XSETCAR (tail, CODING_ID_NAME (id));
6926 }
6927 else
6928 XSETCAR (tail, CODING_ID_NAME (id));
6929 }
6930 }
6931
6932 return (highest ? XCAR (val) : val);
6933 }
6934
6935
6936 DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region,
6937 2, 3, 0,
6938 doc: /* Detect coding system of the text in the region between START and END.
6939 Return a list of possible coding systems ordered by priority.
6940
6941 If only ASCII characters are found, it returns a list of single element
6942 `undecided' or its subsidiary coding system according to a detected
6943 end-of-line format.
6944
6945 If optional argument HIGHEST is non-nil, return the coding system of
6946 highest priority. */)
6947 (start, end, highest)
6948 Lisp_Object start, end, highest;
6949 {
6950 int from, to;
6951 int from_byte, to_byte;
6952
6953 CHECK_NUMBER_COERCE_MARKER (start);
6954 CHECK_NUMBER_COERCE_MARKER (end);
6955
6956 validate_region (&start, &end);
6957 from = XINT (start), to = XINT (end);
6958 from_byte = CHAR_TO_BYTE (from);
6959 to_byte = CHAR_TO_BYTE (to);
6960
6961 if (from < GPT && to >= GPT)
6962 move_gap_both (to, to_byte);
6963
6964 return detect_coding_system (BYTE_POS_ADDR (from_byte),
6965 to_byte - from_byte,
6966 !NILP (highest),
6967 !NILP (current_buffer
6968 ->enable_multibyte_characters),
6969 Qnil);
6970 }
6971
6972 DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string,
6973 1, 2, 0,
6974 doc: /* Detect coding system of the text in STRING.
6975 Return a list of possible coding systems ordered by priority.
6976
6977 If only ASCII characters are found, it returns a list of single element
6978 `undecided' or its subsidiary coding system according to a detected
6979 end-of-line format.
6980
6981 If optional argument HIGHEST is non-nil, return the coding system of
6982 highest priority. */)
6983 (string, highest)
6984 Lisp_Object string, highest;
6985 {
6986 CHECK_STRING (string);
6987
6988 return detect_coding_system (XSTRING (string)->data,
6989 STRING_BYTES (XSTRING (string)),
6990 !NILP (highest),
6991 STRING_MULTIBYTE (string),
6992 Qnil);
6993 }
6994
6995
6996 static INLINE int
6997 char_encodable_p (c, attrs)
6998 int c;
6999 Lisp_Object attrs;
7000 {
7001 Lisp_Object tail;
7002 struct charset *charset;
7003
7004 for (tail = CODING_ATTR_CHARSET_LIST (attrs);
7005 CONSP (tail); tail = XCDR (tail))
7006 {
7007 charset = CHARSET_FROM_ID (XINT (XCAR (tail)));
7008 if (CHAR_CHARSET_P (c, charset))
7009 break;
7010 }
7011 return (! NILP (tail));
7012 }
7013
7014
7015 /* Return a list of coding systems that safely encode the text between
7016 START and END. If EXCLUDE is non-nil, it is a list of coding
7017 systems not to check. The returned list doesn't contain any such
7018 coding systems. In any case, if the text contains only ASCII or is
7019 unibyte, return t. */
7020
7021 DEFUN ("find-coding-systems-region-internal",
7022 Ffind_coding_systems_region_internal,
7023 Sfind_coding_systems_region_internal, 2, 3, 0,
7024 doc: /* Internal use only. */)
7025 (start, end, exclude)
7026 Lisp_Object start, end, exclude;
7027 {
7028 Lisp_Object coding_attrs_list, safe_codings;
7029 EMACS_INT start_byte, end_byte;
7030 const unsigned char *p, *pbeg, *pend;
7031 int c;
7032 Lisp_Object tail, elt;
7033
7034 if (STRINGP (start))
7035 {
7036 if (!STRING_MULTIBYTE (start)
7037 || XSTRING (start)->size == STRING_BYTES (XSTRING (start)))
7038 return Qt;
7039 start_byte = 0;
7040 end_byte = STRING_BYTES (XSTRING (start));
7041 }
7042 else
7043 {
7044 CHECK_NUMBER_COERCE_MARKER (start);
7045 CHECK_NUMBER_COERCE_MARKER (end);
7046 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7047 args_out_of_range (start, end);
7048 if (NILP (current_buffer->enable_multibyte_characters))
7049 return Qt;
7050 start_byte = CHAR_TO_BYTE (XINT (start));
7051 end_byte = CHAR_TO_BYTE (XINT (end));
7052 if (XINT (end) - XINT (start) == end_byte - start_byte)
7053 return Qt;
7054
7055 if (XINT (start) < GPT && XINT (end) > GPT)
7056 {
7057 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7058 move_gap_both (XINT (start), start_byte);
7059 else
7060 move_gap_both (XINT (end), end_byte);
7061 }
7062 }
7063
7064 coding_attrs_list = Qnil;
7065 for (tail = Vcoding_system_list; CONSP (tail); tail = XCDR (tail))
7066 if (NILP (exclude)
7067 || NILP (Fmemq (XCAR (tail), exclude)))
7068 {
7069 Lisp_Object attrs;
7070
7071 attrs = AREF (CODING_SYSTEM_SPEC (XCAR (tail)), 0);
7072 if (EQ (XCAR (tail), CODING_ATTR_BASE_NAME (attrs))
7073 && ! EQ (CODING_ATTR_TYPE (attrs), Qundecided))
7074 coding_attrs_list = Fcons (attrs, coding_attrs_list);
7075 }
7076
7077 if (STRINGP (start))
7078 p = pbeg = XSTRING (start)->data;
7079 else
7080 p = pbeg = BYTE_POS_ADDR (start_byte);
7081 pend = p + (end_byte - start_byte);
7082
7083 while (p < pend && ASCII_BYTE_P (*p)) p++;
7084 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7085
7086 while (p < pend)
7087 {
7088 if (ASCII_BYTE_P (*p))
7089 p++;
7090 else
7091 {
7092 c = STRING_CHAR_ADVANCE (p);
7093
7094 charset_map_loaded = 0;
7095 for (tail = coding_attrs_list; CONSP (tail);)
7096 {
7097 elt = XCAR (tail);
7098 if (NILP (elt))
7099 tail = XCDR (tail);
7100 else if (char_encodable_p (c, elt))
7101 tail = XCDR (tail);
7102 else if (CONSP (XCDR (tail)))
7103 {
7104 XSETCAR (tail, XCAR (XCDR (tail)));
7105 XSETCDR (tail, XCDR (XCDR (tail)));
7106 }
7107 else
7108 {
7109 XSETCAR (tail, Qnil);
7110 tail = XCDR (tail);
7111 }
7112 }
7113 if (charset_map_loaded)
7114 {
7115 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7116
7117 if (STRINGP (start))
7118 pbeg = XSTRING (start)->data;
7119 else
7120 pbeg = BYTE_POS_ADDR (start_byte);
7121 p = pbeg + p_offset;
7122 pend = pbeg + pend_offset;
7123 }
7124 }
7125 }
7126
7127 safe_codings = Qnil;
7128 for (tail = coding_attrs_list; CONSP (tail); tail = XCDR (tail))
7129 if (! NILP (XCAR (tail)))
7130 safe_codings = Fcons (CODING_ATTR_BASE_NAME (XCAR (tail)), safe_codings);
7131
7132 return safe_codings;
7133 }
7134
7135
7136 DEFUN ("check-coding-systems-region", Fcheck_coding_systems_region,
7137 Scheck_coding_systems_region, 3, 3, 0,
7138 doc: /* Check if the region is encodable by coding systems.
7139
7140 START and END are buffer positions specifying the region.
7141 CODING-SYSTEM-LIST is a list of coding systems to check.
7142
7143 The value is an alist ((CODING-SYSTEM POS0 POS1 ...) ...), where
7144 CODING-SYSTEM is a member of CODING-SYSTEM-LIst and can't encode the
7145 whole region, POS0, POS1, ... are buffer positions where non-encodable
7146 characters are found.
7147
7148 If all coding systems in CODING-SYSTEM-LIST can encode the region, the
7149 value is nil.
7150
7151 START may be a string. In that case, check if the string is
7152 encodable, and the value contains indices to the string instead of
7153 buffer positions. END is ignored. */)
7154 (start, end, coding_system_list)
7155 Lisp_Object start, end, coding_system_list;
7156 {
7157 Lisp_Object list;
7158 EMACS_INT start_byte, end_byte;
7159 int pos;
7160 const unsigned char *p, *pbeg, *pend;
7161 int c;
7162 Lisp_Object tail, elt;
7163
7164 if (STRINGP (start))
7165 {
7166 if (!STRING_MULTIBYTE (start)
7167 && XSTRING (start)->size != STRING_BYTES (XSTRING (start)))
7168 return Qnil;
7169 start_byte = 0;
7170 end_byte = STRING_BYTES (XSTRING (start));
7171 pos = 0;
7172 }
7173 else
7174 {
7175 CHECK_NUMBER_COERCE_MARKER (start);
7176 CHECK_NUMBER_COERCE_MARKER (end);
7177 if (XINT (start) < BEG || XINT (end) > Z || XINT (start) > XINT (end))
7178 args_out_of_range (start, end);
7179 if (NILP (current_buffer->enable_multibyte_characters))
7180 return Qnil;
7181 start_byte = CHAR_TO_BYTE (XINT (start));
7182 end_byte = CHAR_TO_BYTE (XINT (end));
7183 if (XINT (end) - XINT (start) == end_byte - start_byte)
7184 return Qt;
7185
7186 if (XINT (start) < GPT && XINT (end) > GPT)
7187 {
7188 if ((GPT - XINT (start)) < (XINT (end) - GPT))
7189 move_gap_both (XINT (start), start_byte);
7190 else
7191 move_gap_both (XINT (end), end_byte);
7192 }
7193 pos = XINT (start);
7194 }
7195
7196 list = Qnil;
7197 for (tail = coding_system_list; CONSP (tail); tail = XCDR (tail))
7198 {
7199 elt = XCAR (tail);
7200 list = Fcons (Fcons (elt, Fcons (AREF (CODING_SYSTEM_SPEC (elt), 0),
7201 Qnil)),
7202 list);
7203 }
7204
7205 if (STRINGP (start))
7206 p = pbeg = XSTRING (start)->data;
7207 else
7208 p = pbeg = BYTE_POS_ADDR (start_byte);
7209 pend = p + (end_byte - start_byte);
7210
7211 while (p < pend && ASCII_BYTE_P (*p)) p++, pos++;
7212 while (p < pend && ASCII_BYTE_P (*(pend - 1))) pend--;
7213
7214 while (p < pend)
7215 {
7216 if (ASCII_BYTE_P (*p))
7217 p++;
7218 else
7219 {
7220 c = STRING_CHAR_ADVANCE (p);
7221
7222 charset_map_loaded = 0;
7223 for (tail = list; CONSP (tail); tail = XCDR (tail))
7224 {
7225 elt = XCDR (XCAR (tail));
7226 if (! char_encodable_p (c, XCAR (elt)))
7227 XSETCDR (elt, Fcons (make_number (pos), XCDR (elt)));
7228 }
7229 if (charset_map_loaded)
7230 {
7231 EMACS_INT p_offset = p - pbeg, pend_offset = pend - pbeg;
7232
7233 if (STRINGP (start))
7234 pbeg = XSTRING (start)->data;
7235 else
7236 pbeg = BYTE_POS_ADDR (start_byte);
7237 p = pbeg + p_offset;
7238 pend = pbeg + pend_offset;
7239 }
7240 }
7241 pos++;
7242 }
7243
7244 tail = list;
7245 list = Qnil;
7246 for (; CONSP (tail); tail = XCDR (tail))
7247 {
7248 elt = XCAR (tail);
7249 if (CONSP (XCDR (XCDR (elt))))
7250 list = Fcons (Fcons (XCAR (elt), Fnreverse (XCDR (XCDR (elt)))),
7251 list);
7252 }
7253
7254 return list;
7255 }
7256
7257
7258
7259 Lisp_Object
7260 code_convert_region (start, end, coding_system, dst_object, encodep, norecord)
7261 Lisp_Object start, end, coding_system, dst_object;
7262 int encodep, norecord;
7263 {
7264 struct coding_system coding;
7265 EMACS_INT from, from_byte, to, to_byte;
7266 Lisp_Object src_object;
7267
7268 CHECK_NUMBER_COERCE_MARKER (start);
7269 CHECK_NUMBER_COERCE_MARKER (end);
7270 if (NILP (coding_system))
7271 coding_system = Qno_conversion;
7272 else
7273 CHECK_CODING_SYSTEM (coding_system);
7274 src_object = Fcurrent_buffer ();
7275 if (NILP (dst_object))
7276 dst_object = src_object;
7277 else if (! EQ (dst_object, Qt))
7278 CHECK_BUFFER (dst_object);
7279
7280 validate_region (&start, &end);
7281 from = XFASTINT (start);
7282 from_byte = CHAR_TO_BYTE (from);
7283 to = XFASTINT (end);
7284 to_byte = CHAR_TO_BYTE (to);
7285
7286 setup_coding_system (coding_system, &coding);
7287 coding.mode |= CODING_MODE_LAST_BLOCK;
7288
7289 if (encodep)
7290 encode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7291 dst_object);
7292 else
7293 decode_coding_object (&coding, src_object, from, from_byte, to, to_byte,
7294 dst_object);
7295 if (! norecord)
7296 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7297
7298 if (coding.result != CODING_RESULT_SUCCESS)
7299 error ("Code conversion error: %d", coding.result);
7300
7301 return (BUFFERP (dst_object)
7302 ? make_number (coding.produced_char)
7303 : coding.dst_object);
7304 }
7305
7306
7307 DEFUN ("decode-coding-region", Fdecode_coding_region, Sdecode_coding_region,
7308 3, 4, "r\nzCoding system: ",
7309 doc: /* Decode the current region from the specified coding system.
7310 When called from a program, takes four arguments:
7311 START, END, CODING-SYSTEM, and DESTINATION.
7312 START and END are buffer positions.
7313
7314 Optional 4th arguments DESTINATION specifies where the decoded text goes.
7315 If nil, the region between START and END is replace by the decoded text.
7316 If buffer, the decoded text is inserted in the buffer.
7317 If t, the decoded text is returned.
7318
7319 This function sets `last-coding-system-used' to the precise coding system
7320 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7321 not fully specified.)
7322 It returns the length of the decoded text. */)
7323 (start, end, coding_system, destination)
7324 Lisp_Object start, end, coding_system, destination;
7325 {
7326 return code_convert_region (start, end, coding_system, destination, 0, 0);
7327 }
7328
7329 DEFUN ("encode-coding-region", Fencode_coding_region, Sencode_coding_region,
7330 3, 4, "r\nzCoding system: ",
7331 doc: /* Encode the current region by specified coding system.
7332 When called from a program, takes three arguments:
7333 START, END, and CODING-SYSTEM. START and END are buffer positions.
7334
7335 Optional 4th arguments DESTINATION specifies where the encoded text goes.
7336 If nil, the region between START and END is replace by the encoded text.
7337 If buffer, the encoded text is inserted in the buffer.
7338 If t, the encoded text is returned.
7339
7340 This function sets `last-coding-system-used' to the precise coding system
7341 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7342 not fully specified.)
7343 It returns the length of the encoded text. */)
7344 (start, end, coding_system, destination)
7345 Lisp_Object start, end, coding_system, destination;
7346 {
7347 return code_convert_region (start, end, coding_system, destination, 1, 0);
7348 }
7349
7350 Lisp_Object
7351 code_convert_string (string, coding_system, dst_object,
7352 encodep, nocopy, norecord)
7353 Lisp_Object string, coding_system, dst_object;
7354 int encodep, nocopy, norecord;
7355 {
7356 struct coding_system coding;
7357 EMACS_INT chars, bytes;
7358
7359 CHECK_STRING (string);
7360 if (NILP (coding_system))
7361 {
7362 if (! norecord)
7363 Vlast_coding_system_used = Qno_conversion;
7364 if (NILP (dst_object))
7365 return (nocopy ? Fcopy_sequence (string) : string);
7366 }
7367
7368 if (NILP (coding_system))
7369 coding_system = Qno_conversion;
7370 else
7371 CHECK_CODING_SYSTEM (coding_system);
7372 if (NILP (dst_object))
7373 dst_object = Qt;
7374 else if (! EQ (dst_object, Qt))
7375 CHECK_BUFFER (dst_object);
7376
7377 setup_coding_system (coding_system, &coding);
7378 coding.mode |= CODING_MODE_LAST_BLOCK;
7379 chars = XSTRING (string)->size;
7380 bytes = STRING_BYTES (XSTRING (string));
7381 if (encodep)
7382 encode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7383 else
7384 decode_coding_object (&coding, string, 0, 0, chars, bytes, dst_object);
7385 if (! norecord)
7386 Vlast_coding_system_used = CODING_ID_NAME (coding.id);
7387
7388 if (coding.result != CODING_RESULT_SUCCESS)
7389 error ("Code conversion error: %d", coding.result);
7390
7391 return (BUFFERP (dst_object)
7392 ? make_number (coding.produced_char)
7393 : coding.dst_object);
7394 }
7395
7396
7397 /* Encode or decode STRING according to CODING_SYSTEM.
7398 Do not set Vlast_coding_system_used.
7399
7400 This function is called only from macros DECODE_FILE and
7401 ENCODE_FILE, thus we ignore character composition. */
7402
7403 Lisp_Object
7404 code_convert_string_norecord (string, coding_system, encodep)
7405 Lisp_Object string, coding_system;
7406 int encodep;
7407 {
7408 return code_convert_string (string, coding_system, Qt, encodep, 0, 1);
7409 }
7410
7411
7412 DEFUN ("decode-coding-string", Fdecode_coding_string, Sdecode_coding_string,
7413 2, 4, 0,
7414 doc: /* Decode STRING which is encoded in CODING-SYSTEM, and return the result.
7415
7416 Optional third arg NOCOPY non-nil means it is OK to return STRING itself
7417 if the decoding operation is trivial.
7418
7419 Optional fourth arg BUFFER non-nil meant that the decoded text is
7420 inserted in BUFFER instead of returned as a string. In this case,
7421 the return value is BUFFER.
7422
7423 This function sets `last-coding-system-used' to the precise coding system
7424 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7425 not fully specified. */)
7426 (string, coding_system, nocopy, buffer)
7427 Lisp_Object string, coding_system, nocopy, buffer;
7428 {
7429 return code_convert_string (string, coding_system, buffer,
7430 0, ! NILP (nocopy), 0);
7431 }
7432
7433 DEFUN ("encode-coding-string", Fencode_coding_string, Sencode_coding_string,
7434 2, 4, 0,
7435 doc: /* Encode STRING to CODING-SYSTEM, and return the result.
7436
7437 Optional third arg NOCOPY non-nil means it is OK to return STRING
7438 itself if the encoding operation is trivial.
7439
7440 Optional fourth arg BUFFER non-nil meant that the encoded text is
7441 inserted in BUFFER instead of returned as a string. In this case,
7442 the return value is BUFFER.
7443
7444 This function sets `last-coding-system-used' to the precise coding system
7445 used (which may be different from CODING-SYSTEM if CODING-SYSTEM is
7446 not fully specified.) */)
7447 (string, coding_system, nocopy, buffer)
7448 Lisp_Object string, coding_system, nocopy, buffer;
7449 {
7450 return code_convert_string (string, coding_system, buffer,
7451 1, ! NILP (nocopy), 1);
7452 }
7453
7454 \f
7455 DEFUN ("decode-sjis-char", Fdecode_sjis_char, Sdecode_sjis_char, 1, 1, 0,
7456 doc: /* Decode a Japanese character which has CODE in shift_jis encoding.
7457 Return the corresponding character. */)
7458 (code)
7459 Lisp_Object code;
7460 {
7461 Lisp_Object spec, attrs, val;
7462 struct charset *charset_roman, *charset_kanji, *charset_kana, *charset;
7463 int c;
7464
7465 CHECK_NATNUM (code);
7466 c = XFASTINT (code);
7467 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7468 attrs = AREF (spec, 0);
7469
7470 if (ASCII_BYTE_P (c)
7471 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7472 return code;
7473
7474 val = CODING_ATTR_CHARSET_LIST (attrs);
7475 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7476 charset_kana = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7477 charset_kanji = CHARSET_FROM_ID (XINT (XCAR (val)));
7478
7479 if (c <= 0x7F)
7480 charset = charset_roman;
7481 else if (c >= 0xA0 && c < 0xDF)
7482 {
7483 charset = charset_kana;
7484 c -= 0x80;
7485 }
7486 else
7487 {
7488 int s1 = c >> 8, s2 = c & 0xFF;
7489
7490 if (s1 < 0x81 || (s1 > 0x9F && s1 < 0xE0) || s1 > 0xEF
7491 || s2 < 0x40 || s2 == 0x7F || s2 > 0xFC)
7492 error ("Invalid code: %d", code);
7493 SJIS_TO_JIS (c);
7494 charset = charset_kanji;
7495 }
7496 c = DECODE_CHAR (charset, c);
7497 if (c < 0)
7498 error ("Invalid code: %d", code);
7499 return make_number (c);
7500 }
7501
7502
7503 DEFUN ("encode-sjis-char", Fencode_sjis_char, Sencode_sjis_char, 1, 1, 0,
7504 doc: /* Encode a Japanese character CHAR to shift_jis encoding.
7505 Return the corresponding code in SJIS. */)
7506 (ch)
7507 Lisp_Object ch;
7508 {
7509 Lisp_Object spec, attrs, charset_list;
7510 int c;
7511 struct charset *charset;
7512 unsigned code;
7513
7514 CHECK_CHARACTER (ch);
7515 c = XFASTINT (ch);
7516 CHECK_CODING_SYSTEM_GET_SPEC (Vsjis_coding_system, spec);
7517 attrs = AREF (spec, 0);
7518
7519 if (ASCII_CHAR_P (c)
7520 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7521 return ch;
7522
7523 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7524 charset = char_charset (c, charset_list, &code);
7525 if (code == CHARSET_INVALID_CODE (charset))
7526 error ("Can't encode by shift_jis encoding: %d", c);
7527 JIS_TO_SJIS (code);
7528
7529 return make_number (code);
7530 }
7531
7532 DEFUN ("decode-big5-char", Fdecode_big5_char, Sdecode_big5_char, 1, 1, 0,
7533 doc: /* Decode a Big5 character which has CODE in BIG5 coding system.
7534 Return the corresponding character. */)
7535 (code)
7536 Lisp_Object code;
7537 {
7538 Lisp_Object spec, attrs, val;
7539 struct charset *charset_roman, *charset_big5, *charset;
7540 int c;
7541
7542 CHECK_NATNUM (code);
7543 c = XFASTINT (code);
7544 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7545 attrs = AREF (spec, 0);
7546
7547 if (ASCII_BYTE_P (c)
7548 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7549 return code;
7550
7551 val = CODING_ATTR_CHARSET_LIST (attrs);
7552 charset_roman = CHARSET_FROM_ID (XINT (XCAR (val))), val = XCDR (val);
7553 charset_big5 = CHARSET_FROM_ID (XINT (XCAR (val)));
7554
7555 if (c <= 0x7F)
7556 charset = charset_roman;
7557 else
7558 {
7559 int b1 = c >> 8, b2 = c & 0x7F;
7560 if (b1 < 0xA1 || b1 > 0xFE
7561 || b2 < 0x40 || (b2 > 0x7E && b2 < 0xA1) || b2 > 0xFE)
7562 error ("Invalid code: %d", code);
7563 charset = charset_big5;
7564 }
7565 c = DECODE_CHAR (charset, (unsigned )c);
7566 if (c < 0)
7567 error ("Invalid code: %d", code);
7568 return make_number (c);
7569 }
7570
7571 DEFUN ("encode-big5-char", Fencode_big5_char, Sencode_big5_char, 1, 1, 0,
7572 doc: /* Encode the Big5 character CHAR to BIG5 coding system.
7573 Return the corresponding character code in Big5. */)
7574 (ch)
7575 Lisp_Object ch;
7576 {
7577 Lisp_Object spec, attrs, charset_list;
7578 struct charset *charset;
7579 int c;
7580 unsigned code;
7581
7582 CHECK_CHARACTER (ch);
7583 c = XFASTINT (ch);
7584 CHECK_CODING_SYSTEM_GET_SPEC (Vbig5_coding_system, spec);
7585 attrs = AREF (spec, 0);
7586 if (ASCII_CHAR_P (c)
7587 && ! NILP (CODING_ATTR_ASCII_COMPAT (attrs)))
7588 return ch;
7589
7590 charset_list = CODING_ATTR_CHARSET_LIST (attrs);
7591 charset = char_charset (c, charset_list, &code);
7592 if (code == CHARSET_INVALID_CODE (charset))
7593 error ("Can't encode by Big5 encoding: %d", c);
7594
7595 return make_number (code);
7596 }
7597
7598 \f
7599 DEFUN ("set-terminal-coding-system-internal",
7600 Fset_terminal_coding_system_internal,
7601 Sset_terminal_coding_system_internal, 1, 1, 0,
7602 doc: /* Internal use only. */)
7603 (coding_system)
7604 Lisp_Object coding_system;
7605 {
7606 CHECK_SYMBOL (coding_system);
7607 setup_coding_system (Fcheck_coding_system (coding_system),
7608 &terminal_coding);
7609
7610 /* We had better not send unsafe characters to terminal. */
7611 terminal_coding.mode |= CODING_MODE_SAFE_ENCODING;
7612 /* Characer composition should be disabled. */
7613 terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7614 terminal_coding.src_multibyte = 1;
7615 terminal_coding.dst_multibyte = 0;
7616 return Qnil;
7617 }
7618
7619 DEFUN ("set-safe-terminal-coding-system-internal",
7620 Fset_safe_terminal_coding_system_internal,
7621 Sset_safe_terminal_coding_system_internal, 1, 1, 0,
7622 doc: /* Internal use only. */)
7623 (coding_system)
7624 Lisp_Object coding_system;
7625 {
7626 CHECK_SYMBOL (coding_system);
7627 setup_coding_system (Fcheck_coding_system (coding_system),
7628 &safe_terminal_coding);
7629 /* Characer composition should be disabled. */
7630 safe_terminal_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7631 safe_terminal_coding.src_multibyte = 1;
7632 safe_terminal_coding.dst_multibyte = 0;
7633 return Qnil;
7634 }
7635
7636 DEFUN ("terminal-coding-system",
7637 Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0,
7638 doc: /* Return coding system specified for terminal output. */)
7639 ()
7640 {
7641 return CODING_ID_NAME (terminal_coding.id);
7642 }
7643
7644 DEFUN ("set-keyboard-coding-system-internal",
7645 Fset_keyboard_coding_system_internal,
7646 Sset_keyboard_coding_system_internal, 1, 1, 0,
7647 doc: /* Internal use only. */)
7648 (coding_system)
7649 Lisp_Object coding_system;
7650 {
7651 CHECK_SYMBOL (coding_system);
7652 setup_coding_system (Fcheck_coding_system (coding_system),
7653 &keyboard_coding);
7654 /* Characer composition should be disabled. */
7655 keyboard_coding.common_flags &= ~CODING_ANNOTATE_COMPOSITION_MASK;
7656 return Qnil;
7657 }
7658
7659 DEFUN ("keyboard-coding-system",
7660 Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0,
7661 doc: /* Return coding system specified for decoding keyboard input. */)
7662 ()
7663 {
7664 return CODING_ID_NAME (keyboard_coding.id);
7665 }
7666
7667 \f
7668 DEFUN ("find-operation-coding-system", Ffind_operation_coding_system,
7669 Sfind_operation_coding_system, 1, MANY, 0,
7670 doc: /* Choose a coding system for an operation based on the target name.
7671 The value names a pair of coding systems: (DECODING-SYSTEM . ENCODING-SYSTEM).
7672 DECODING-SYSTEM is the coding system to use for decoding
7673 \(in case OPERATION does decoding), and ENCODING-SYSTEM is the coding system
7674 for encoding (in case OPERATION does encoding).
7675
7676 The first argument OPERATION specifies an I/O primitive:
7677 For file I/O, `insert-file-contents' or `write-region'.
7678 For process I/O, `call-process', `call-process-region', or `start-process'.
7679 For network I/O, `open-network-stream'.
7680
7681 The remaining arguments should be the same arguments that were passed
7682 to the primitive. Depending on which primitive, one of those arguments
7683 is selected as the TARGET. For example, if OPERATION does file I/O,
7684 whichever argument specifies the file name is TARGET.
7685
7686 TARGET has a meaning which depends on OPERATION:
7687 For file I/O, TARGET is a file name.
7688 For process I/O, TARGET is a process name.
7689 For network I/O, TARGET is a service name or a port number
7690
7691 This function looks up what specified for TARGET in,
7692 `file-coding-system-alist', `process-coding-system-alist',
7693 or `network-coding-system-alist' depending on OPERATION.
7694 They may specify a coding system, a cons of coding systems,
7695 or a function symbol to call.
7696 In the last case, we call the function with one argument,
7697 which is a list of all the arguments given to this function.
7698
7699 usage: (find-operation-coding-system OPERATION ARGUMENTS ...) */)
7700 (nargs, args)
7701 int nargs;
7702 Lisp_Object *args;
7703 {
7704 Lisp_Object operation, target_idx, target, val;
7705 register Lisp_Object chain;
7706
7707 if (nargs < 2)
7708 error ("Too few arguments");
7709 operation = args[0];
7710 if (!SYMBOLP (operation)
7711 || !INTEGERP (target_idx = Fget (operation, Qtarget_idx)))
7712 error ("Invalid first arguement");
7713 if (nargs < 1 + XINT (target_idx))
7714 error ("Too few arguments for operation: %s",
7715 XSYMBOL (operation)->name->data);
7716 target = args[XINT (target_idx) + 1];
7717 if (!(STRINGP (target)
7718 || (EQ (operation, Qopen_network_stream) && INTEGERP (target))))
7719 error ("Invalid %dth argument", XINT (target_idx) + 1);
7720
7721 chain = ((EQ (operation, Qinsert_file_contents)
7722 || EQ (operation, Qwrite_region))
7723 ? Vfile_coding_system_alist
7724 : (EQ (operation, Qopen_network_stream)
7725 ? Vnetwork_coding_system_alist
7726 : Vprocess_coding_system_alist));
7727 if (NILP (chain))
7728 return Qnil;
7729
7730 for (; CONSP (chain); chain = XCDR (chain))
7731 {
7732 Lisp_Object elt;
7733
7734 elt = XCAR (chain);
7735 if (CONSP (elt)
7736 && ((STRINGP (target)
7737 && STRINGP (XCAR (elt))
7738 && fast_string_match (XCAR (elt), target) >= 0)
7739 || (INTEGERP (target) && EQ (target, XCAR (elt)))))
7740 {
7741 val = XCDR (elt);
7742 /* Here, if VAL is both a valid coding system and a valid
7743 function symbol, we return VAL as a coding system. */
7744 if (CONSP (val))
7745 return val;
7746 if (! SYMBOLP (val))
7747 return Qnil;
7748 if (! NILP (Fcoding_system_p (val)))
7749 return Fcons (val, val);
7750 if (! NILP (Ffboundp (val)))
7751 {
7752 val = call1 (val, Flist (nargs, args));
7753 if (CONSP (val))
7754 return val;
7755 if (SYMBOLP (val) && ! NILP (Fcoding_system_p (val)))
7756 return Fcons (val, val);
7757 }
7758 return Qnil;
7759 }
7760 }
7761 return Qnil;
7762 }
7763
7764 DEFUN ("set-coding-system-priority", Fset_coding_system_priority,
7765 Sset_coding_system_priority, 0, MANY, 0,
7766 doc: /* Assign higher priority to the coding systems given as arguments.
7767 If multiple coding systems belongs to the same category,
7768 all but the first one are ignored. */)
7769 (nargs, args)
7770 int nargs;
7771 Lisp_Object *args;
7772 {
7773 int i, j;
7774 int changed[coding_category_max];
7775 enum coding_category priorities[coding_category_max];
7776
7777 bzero (changed, sizeof changed);
7778
7779 for (i = j = 0; i < nargs; i++)
7780 {
7781 enum coding_category category;
7782 Lisp_Object spec, attrs;
7783
7784 CHECK_CODING_SYSTEM_GET_SPEC (args[i], spec);
7785 attrs = AREF (spec, 0);
7786 category = XINT (CODING_ATTR_CATEGORY (attrs));
7787 if (changed[category])
7788 /* Ignore this coding system because a coding system of the
7789 same category already had a higher priority. */
7790 continue;
7791 changed[category] = 1;
7792 priorities[j++] = category;
7793 if (coding_categories[category].id >= 0
7794 && ! EQ (args[i], CODING_ID_NAME (coding_categories[category].id)))
7795 setup_coding_system (args[i], &coding_categories[category]);
7796 Fset (AREF (Vcoding_category_table, category), args[i]);
7797 }
7798
7799 /* Now we have decided top J priorities. Reflect the order of the
7800 original priorities to the remaining priorities. */
7801
7802 for (i = j, j = 0; i < coding_category_max; i++, j++)
7803 {
7804 while (j < coding_category_max
7805 && changed[coding_priorities[j]])
7806 j++;
7807 if (j == coding_category_max)
7808 abort ();
7809 priorities[i] = coding_priorities[j];
7810 }
7811
7812 bcopy (priorities, coding_priorities, sizeof priorities);
7813
7814 /* Update `coding-category-list'. */
7815 Vcoding_category_list = Qnil;
7816 for (i = coding_category_max - 1; i >= 0; i--)
7817 Vcoding_category_list
7818 = Fcons (AREF (Vcoding_category_table, priorities[i]),
7819 Vcoding_category_list);
7820
7821 return Qnil;
7822 }
7823
7824 DEFUN ("coding-system-priority-list", Fcoding_system_priority_list,
7825 Scoding_system_priority_list, 0, 1, 0,
7826 doc: /* Return a list of coding systems ordered by their priorities.
7827 HIGHESTP non-nil means just return the highest priority one. */)
7828 (highestp)
7829 Lisp_Object highestp;
7830 {
7831 int i;
7832 Lisp_Object val;
7833
7834 for (i = 0, val = Qnil; i < coding_category_max; i++)
7835 {
7836 enum coding_category category = coding_priorities[i];
7837 int id = coding_categories[category].id;
7838 Lisp_Object attrs;
7839
7840 if (id < 0)
7841 continue;
7842 attrs = CODING_ID_ATTRS (id);
7843 if (! NILP (highestp))
7844 return CODING_ATTR_BASE_NAME (attrs);
7845 val = Fcons (CODING_ATTR_BASE_NAME (attrs), val);
7846 }
7847 return Fnreverse (val);
7848 }
7849
7850 static char *suffixes[] = { "-unix", "-dos", "-mac" };
7851
7852 static Lisp_Object
7853 make_subsidiaries (base)
7854 Lisp_Object base;
7855 {
7856 Lisp_Object subsidiaries;
7857 int base_name_len = STRING_BYTES (XSYMBOL (base)->name);
7858 char *buf = (char *) alloca (base_name_len + 6);
7859 int i;
7860
7861 bcopy (XSYMBOL (base)->name->data, buf, base_name_len);
7862 subsidiaries = Fmake_vector (make_number (3), Qnil);
7863 for (i = 0; i < 3; i++)
7864 {
7865 bcopy (suffixes[i], buf + base_name_len, strlen (suffixes[i]) + 1);
7866 ASET (subsidiaries, i, intern (buf));
7867 }
7868 return subsidiaries;
7869 }
7870
7871
7872 DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal,
7873 Sdefine_coding_system_internal, coding_arg_max, MANY, 0,
7874 doc: /* For internal use only.
7875 usage: (define-coding-system-internal ...) */)
7876 (nargs, args)
7877 int nargs;
7878 Lisp_Object *args;
7879 {
7880 Lisp_Object name;
7881 Lisp_Object spec_vec; /* [ ATTRS ALIASE EOL_TYPE ] */
7882 Lisp_Object attrs; /* Vector of attributes. */
7883 Lisp_Object eol_type;
7884 Lisp_Object aliases;
7885 Lisp_Object coding_type, charset_list, safe_charsets;
7886 enum coding_category category;
7887 Lisp_Object tail, val;
7888 int max_charset_id = 0;
7889 int i;
7890
7891 if (nargs < coding_arg_max)
7892 goto short_args;
7893
7894 attrs = Fmake_vector (make_number (coding_attr_last_index), Qnil);
7895
7896 name = args[coding_arg_name];
7897 CHECK_SYMBOL (name);
7898 CODING_ATTR_BASE_NAME (attrs) = name;
7899
7900 val = args[coding_arg_mnemonic];
7901 if (! STRINGP (val))
7902 CHECK_CHARACTER (val);
7903 CODING_ATTR_MNEMONIC (attrs) = val;
7904
7905 coding_type = args[coding_arg_coding_type];
7906 CHECK_SYMBOL (coding_type);
7907 CODING_ATTR_TYPE (attrs) = coding_type;
7908
7909 charset_list = args[coding_arg_charset_list];
7910 if (SYMBOLP (charset_list))
7911 {
7912 if (EQ (charset_list, Qiso_2022))
7913 {
7914 if (! EQ (coding_type, Qiso_2022))
7915 error ("Invalid charset-list");
7916 charset_list = Viso_2022_charset_list;
7917 }
7918 else if (EQ (charset_list, Qemacs_mule))
7919 {
7920 if (! EQ (coding_type, Qemacs_mule))
7921 error ("Invalid charset-list");
7922 charset_list = Vemacs_mule_charset_list;
7923 }
7924 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7925 if (max_charset_id < XFASTINT (XCAR (tail)))
7926 max_charset_id = XFASTINT (XCAR (tail));
7927 }
7928 else
7929 {
7930 charset_list = Fcopy_sequence (charset_list);
7931 for (tail = charset_list; !NILP (tail); tail = Fcdr (tail))
7932 {
7933 struct charset *charset;
7934
7935 val = Fcar (tail);
7936 CHECK_CHARSET_GET_CHARSET (val, charset);
7937 if (EQ (coding_type, Qiso_2022)
7938 ? CHARSET_ISO_FINAL (charset) < 0
7939 : EQ (coding_type, Qemacs_mule)
7940 ? CHARSET_EMACS_MULE_ID (charset) < 0
7941 : 0)
7942 error ("Can't handle charset `%s'",
7943 XSYMBOL (CHARSET_NAME (charset))->name->data);
7944
7945 XCAR (tail) = make_number (charset->id);
7946 if (max_charset_id < charset->id)
7947 max_charset_id = charset->id;
7948 }
7949 }
7950 CODING_ATTR_CHARSET_LIST (attrs) = charset_list;
7951
7952 safe_charsets = Fmake_string (make_number (max_charset_id + 1),
7953 make_number (255));
7954 for (tail = charset_list; CONSP (tail); tail = XCDR (tail))
7955 XSTRING (safe_charsets)->data[XFASTINT (XCAR (tail))] = 0;
7956 CODING_ATTR_SAFE_CHARSETS (attrs) = safe_charsets;
7957
7958 CODING_ATTR_ASCII_COMPAT (attrs) = args[coding_arg_ascii_compatible_p];
7959
7960 val = args[coding_arg_decode_translation_table];
7961 if (! NILP (val))
7962 CHECK_CHAR_TABLE (val);
7963 CODING_ATTR_DECODE_TBL (attrs) = val;
7964
7965 val = args[coding_arg_encode_translation_table];
7966 if (! NILP (val))
7967 CHECK_CHAR_TABLE (val);
7968 CODING_ATTR_ENCODE_TBL (attrs) = val;
7969
7970 val = args[coding_arg_post_read_conversion];
7971 CHECK_SYMBOL (val);
7972 CODING_ATTR_POST_READ (attrs) = val;
7973
7974 val = args[coding_arg_pre_write_conversion];
7975 CHECK_SYMBOL (val);
7976 CODING_ATTR_PRE_WRITE (attrs) = val;
7977
7978 val = args[coding_arg_default_char];
7979 if (NILP (val))
7980 CODING_ATTR_DEFAULT_CHAR (attrs) = make_number (' ');
7981 else
7982 {
7983 CHECK_CHARACTER (val);
7984 CODING_ATTR_DEFAULT_CHAR (attrs) = val;
7985 }
7986
7987 val = args[coding_arg_plist];
7988 CHECK_LIST (val);
7989 CODING_ATTR_PLIST (attrs) = val;
7990
7991 if (EQ (coding_type, Qcharset))
7992 {
7993 Lisp_Object list;
7994 /* Generate a lisp vector of 256 elements. Each element is nil,
7995 integer, or a list of charset IDs.
7996
7997 If Nth element is nil, the byte code N is invalid in this
7998 coding system.
7999
8000 If Nth element is a number NUM, N is the first byte of a
8001 charset whose ID is NUM.
8002
8003 If Nth element is a list of charset IDs, N is the first byte
8004 of one of them. The list is sorted by dimensions of the
8005 charsets. A charset of smaller dimension comes firtst.
8006 */
8007 for (list = Qnil, tail = charset_list; CONSP (tail); tail = XCDR (tail))
8008 {
8009 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8010
8011 if (charset->method == CHARSET_METHOD_SUPERSET)
8012 {
8013 val = CHARSET_SUPERSET (charset);
8014 for (; CONSP (val); val = XCDR (val))
8015 list = Fcons (XCAR (XCAR (val)), list);
8016 }
8017 else
8018 list = Fcons (XCAR (tail), list);
8019 }
8020
8021 val = Fmake_vector (make_number (256), Qnil);
8022
8023 for (tail = Fnreverse (list); CONSP (tail); tail = XCDR (tail))
8024 {
8025 struct charset *charset = CHARSET_FROM_ID (XFASTINT (XCAR (tail)));
8026 int dim = CHARSET_DIMENSION (charset);
8027 int idx = (dim - 1) * 4;
8028
8029 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8030 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8031
8032 for (i = charset->code_space[idx];
8033 i <= charset->code_space[idx + 1]; i++)
8034 {
8035 Lisp_Object tmp, tmp2;
8036 int dim2;
8037
8038 tmp = AREF (val, i);
8039 if (NILP (tmp))
8040 tmp = XCAR (tail);
8041 else if (NUMBERP (tmp))
8042 {
8043 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (tmp)));
8044 if (dim < dim2)
8045 tmp = Fcons (XCAR (tail), Fcons (tmp, Qnil));
8046 else
8047 tmp = Fcons (tmp, Fcons (XCAR (tail), Qnil));
8048 }
8049 else
8050 {
8051 for (tmp2 = tmp; CONSP (tmp2); tmp2 = XCDR (tmp2))
8052 {
8053 dim2 = CHARSET_DIMENSION (CHARSET_FROM_ID (XFASTINT (XCAR (tmp2))));
8054 if (dim < dim2)
8055 break;
8056 }
8057 if (NILP (tmp2))
8058 tmp = nconc2 (tmp, Fcons (XCAR (tail), Qnil));
8059 else
8060 {
8061 XSETCDR (tmp2, Fcons (XCAR (tmp2), XCDR (tmp2)));
8062 XSETCAR (tmp2, XCAR (tail));
8063 }
8064 }
8065 ASET (val, i, tmp);
8066 }
8067 }
8068 ASET (attrs, coding_attr_charset_valids, val);
8069 category = coding_category_charset;
8070 }
8071 else if (EQ (coding_type, Qccl))
8072 {
8073 Lisp_Object valids;
8074
8075 if (nargs < coding_arg_ccl_max)
8076 goto short_args;
8077
8078 val = args[coding_arg_ccl_decoder];
8079 CHECK_CCL_PROGRAM (val);
8080 if (VECTORP (val))
8081 val = Fcopy_sequence (val);
8082 ASET (attrs, coding_attr_ccl_decoder, val);
8083
8084 val = args[coding_arg_ccl_encoder];
8085 CHECK_CCL_PROGRAM (val);
8086 if (VECTORP (val))
8087 val = Fcopy_sequence (val);
8088 ASET (attrs, coding_attr_ccl_encoder, val);
8089
8090 val = args[coding_arg_ccl_valids];
8091 valids = Fmake_string (make_number (256), make_number (0));
8092 for (tail = val; !NILP (tail); tail = Fcdr (tail))
8093 {
8094 int from, to;
8095
8096 val = Fcar (tail);
8097 if (INTEGERP (val))
8098 {
8099 from = to = XINT (val);
8100 if (from < 0 || from > 255)
8101 args_out_of_range_3 (val, make_number (0), make_number (255));
8102 }
8103 else
8104 {
8105 CHECK_CONS (val);
8106 CHECK_NUMBER (XCAR (val));
8107 CHECK_NUMBER (XCDR (val));
8108 from = XINT (XCAR (val));
8109 if (from < 0 || from > 255)
8110 args_out_of_range_3 (XCAR (val),
8111 make_number (0), make_number (255));
8112 to = XINT (XCDR (val));
8113 if (to < from || to > 255)
8114 args_out_of_range_3 (XCDR (val),
8115 XCAR (val), make_number (255));
8116 }
8117 for (i = from; i <= to; i++)
8118 XSTRING (valids)->data[i] = 1;
8119 }
8120 ASET (attrs, coding_attr_ccl_valids, valids);
8121
8122 category = coding_category_ccl;
8123 }
8124 else if (EQ (coding_type, Qutf_16))
8125 {
8126 Lisp_Object bom, endian;
8127
8128 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8129
8130 if (nargs < coding_arg_utf16_max)
8131 goto short_args;
8132
8133 bom = args[coding_arg_utf16_bom];
8134 if (! NILP (bom) && ! EQ (bom, Qt))
8135 {
8136 CHECK_CONS (bom);
8137 CHECK_CODING_SYSTEM (XCAR (bom));
8138 CHECK_CODING_SYSTEM (XCDR (bom));
8139 }
8140 ASET (attrs, coding_attr_utf_16_bom, bom);
8141
8142 endian = args[coding_arg_utf16_endian];
8143 CHECK_SYMBOL (endian);
8144 if (NILP (endian))
8145 endian = Qbig;
8146 else if (! EQ (endian, Qbig) && ! EQ (endian, Qlittle))
8147 error ("Invalid endian: %s", XSYMBOL (endian)->name->data);
8148 ASET (attrs, coding_attr_utf_16_endian, endian);
8149
8150 category = (CONSP (bom)
8151 ? coding_category_utf_16_auto
8152 : NILP (bom)
8153 ? (EQ (endian, Qbig)
8154 ? coding_category_utf_16_be_nosig
8155 : coding_category_utf_16_le_nosig)
8156 : (EQ (endian, Qbig)
8157 ? coding_category_utf_16_be
8158 : coding_category_utf_16_le));
8159 }
8160 else if (EQ (coding_type, Qiso_2022))
8161 {
8162 Lisp_Object initial, reg_usage, request, flags;
8163 int i;
8164
8165 if (nargs < coding_arg_iso2022_max)
8166 goto short_args;
8167
8168 initial = Fcopy_sequence (args[coding_arg_iso2022_initial]);
8169 CHECK_VECTOR (initial);
8170 for (i = 0; i < 4; i++)
8171 {
8172 val = Faref (initial, make_number (i));
8173 if (! NILP (val))
8174 {
8175 struct charset *charset;
8176
8177 CHECK_CHARSET_GET_CHARSET (val, charset);
8178 ASET (initial, i, make_number (CHARSET_ID (charset)));
8179 if (i == 0 && CHARSET_ASCII_COMPATIBLE_P (charset))
8180 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8181 }
8182 else
8183 ASET (initial, i, make_number (-1));
8184 }
8185
8186 reg_usage = args[coding_arg_iso2022_reg_usage];
8187 CHECK_CONS (reg_usage);
8188 CHECK_NATNUM (XCAR (reg_usage));
8189 CHECK_NATNUM (XCDR (reg_usage));
8190
8191 request = Fcopy_sequence (args[coding_arg_iso2022_request]);
8192 for (tail = request; ! NILP (tail); tail = Fcdr (tail))
8193 {
8194 int id;
8195
8196 val = Fcar (tail);
8197 CHECK_CONS (val);
8198 CHECK_CHARSET_GET_ID (XCAR (val), id);
8199 CHECK_NATNUM (XCDR (val));
8200 if (XINT (XCDR (val)) >= 4)
8201 error ("Invalid graphic register number: %d", XINT (XCDR (val)));
8202 XCAR (val) = make_number (id);
8203 }
8204
8205 flags = args[coding_arg_iso2022_flags];
8206 CHECK_NATNUM (flags);
8207 i = XINT (flags);
8208 if (EQ (args[coding_arg_charset_list], Qiso_2022))
8209 flags = make_number (i | CODING_ISO_FLAG_FULL_SUPPORT);
8210
8211 ASET (attrs, coding_attr_iso_initial, initial);
8212 ASET (attrs, coding_attr_iso_usage, reg_usage);
8213 ASET (attrs, coding_attr_iso_request, request);
8214 ASET (attrs, coding_attr_iso_flags, flags);
8215 setup_iso_safe_charsets (attrs);
8216
8217 if (i & CODING_ISO_FLAG_SEVEN_BITS)
8218 category = ((i & (CODING_ISO_FLAG_LOCKING_SHIFT
8219 | CODING_ISO_FLAG_SINGLE_SHIFT))
8220 ? coding_category_iso_7_else
8221 : EQ (args[coding_arg_charset_list], Qiso_2022)
8222 ? coding_category_iso_7
8223 : coding_category_iso_7_tight);
8224 else
8225 {
8226 int id = XINT (AREF (initial, 1));
8227
8228 category = (((i & CODING_ISO_FLAG_LOCKING_SHIFT)
8229 || EQ (args[coding_arg_charset_list], Qiso_2022)
8230 || id < 0)
8231 ? coding_category_iso_8_else
8232 : (CHARSET_DIMENSION (CHARSET_FROM_ID (id)) == 1)
8233 ? coding_category_iso_8_1
8234 : coding_category_iso_8_2);
8235 }
8236 if (category != coding_category_iso_8_1
8237 && category != coding_category_iso_8_2)
8238 CODING_ATTR_ASCII_COMPAT (attrs) = Qnil;
8239 }
8240 else if (EQ (coding_type, Qemacs_mule))
8241 {
8242 if (EQ (args[coding_arg_charset_list], Qemacs_mule))
8243 ASET (attrs, coding_attr_emacs_mule_full, Qt);
8244 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8245 category = coding_category_emacs_mule;
8246 }
8247 else if (EQ (coding_type, Qshift_jis))
8248 {
8249
8250 struct charset *charset;
8251
8252 if (XINT (Flength (charset_list)) != 3)
8253 error ("There should be just three charsets");
8254
8255 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8256 if (CHARSET_DIMENSION (charset) != 1)
8257 error ("Dimension of charset %s is not one",
8258 XSYMBOL (CHARSET_NAME (charset))->name->data);
8259 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8260 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8261
8262 charset_list = XCDR (charset_list);
8263 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8264 if (CHARSET_DIMENSION (charset) != 1)
8265 error ("Dimension of charset %s is not one",
8266 XSYMBOL (CHARSET_NAME (charset))->name->data);
8267
8268 charset_list = XCDR (charset_list);
8269 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8270 if (CHARSET_DIMENSION (charset) != 2)
8271 error ("Dimension of charset %s is not two",
8272 XSYMBOL (CHARSET_NAME (charset))->name->data);
8273
8274 category = coding_category_sjis;
8275 Vsjis_coding_system = name;
8276 }
8277 else if (EQ (coding_type, Qbig5))
8278 {
8279 struct charset *charset;
8280
8281 if (XINT (Flength (charset_list)) != 2)
8282 error ("There should be just two charsets");
8283
8284 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8285 if (CHARSET_DIMENSION (charset) != 1)
8286 error ("Dimension of charset %s is not one",
8287 XSYMBOL (CHARSET_NAME (charset))->name->data);
8288 if (CHARSET_ASCII_COMPATIBLE_P (charset))
8289 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8290
8291 charset_list = XCDR (charset_list);
8292 charset = CHARSET_FROM_ID (XINT (XCAR (charset_list)));
8293 if (CHARSET_DIMENSION (charset) != 2)
8294 error ("Dimension of charset %s is not two",
8295 XSYMBOL (CHARSET_NAME (charset))->name->data);
8296
8297 category = coding_category_big5;
8298 Vbig5_coding_system = name;
8299 }
8300 else if (EQ (coding_type, Qraw_text))
8301 {
8302 category = coding_category_raw_text;
8303 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8304 }
8305 else if (EQ (coding_type, Qutf_8))
8306 {
8307 category = coding_category_utf_8;
8308 CODING_ATTR_ASCII_COMPAT (attrs) = Qt;
8309 }
8310 else if (EQ (coding_type, Qundecided))
8311 category = coding_category_undecided;
8312 else
8313 error ("Invalid coding system type: %s",
8314 XSYMBOL (coding_type)->name->data);
8315
8316 CODING_ATTR_CATEGORY (attrs) = make_number (category);
8317 CODING_ATTR_PLIST (attrs)
8318 = Fcons (QCcategory, Fcons (AREF (Vcoding_category_table, category),
8319 CODING_ATTR_PLIST (attrs)));
8320
8321 eol_type = args[coding_arg_eol_type];
8322 if (! NILP (eol_type)
8323 && ! EQ (eol_type, Qunix)
8324 && ! EQ (eol_type, Qdos)
8325 && ! EQ (eol_type, Qmac))
8326 error ("Invalid eol-type");
8327
8328 aliases = Fcons (name, Qnil);
8329
8330 if (NILP (eol_type))
8331 {
8332 eol_type = make_subsidiaries (name);
8333 for (i = 0; i < 3; i++)
8334 {
8335 Lisp_Object this_spec, this_name, this_aliases, this_eol_type;
8336
8337 this_name = AREF (eol_type, i);
8338 this_aliases = Fcons (this_name, Qnil);
8339 this_eol_type = (i == 0 ? Qunix : i == 1 ? Qdos : Qmac);
8340 this_spec = Fmake_vector (make_number (3), attrs);
8341 ASET (this_spec, 1, this_aliases);
8342 ASET (this_spec, 2, this_eol_type);
8343 Fputhash (this_name, this_spec, Vcoding_system_hash_table);
8344 Vcoding_system_list = Fcons (this_name, Vcoding_system_list);
8345 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (this_name), Qnil),
8346 Vcoding_system_alist);
8347 }
8348 }
8349
8350 spec_vec = Fmake_vector (make_number (3), attrs);
8351 ASET (spec_vec, 1, aliases);
8352 ASET (spec_vec, 2, eol_type);
8353
8354 Fputhash (name, spec_vec, Vcoding_system_hash_table);
8355 Vcoding_system_list = Fcons (name, Vcoding_system_list);
8356 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (name), Qnil),
8357 Vcoding_system_alist);
8358
8359 {
8360 int id = coding_categories[category].id;
8361
8362 if (id < 0 || EQ (name, CODING_ID_NAME (id)))
8363 setup_coding_system (name, &coding_categories[category]);
8364 }
8365
8366 return Qnil;
8367
8368 short_args:
8369 return Fsignal (Qwrong_number_of_arguments,
8370 Fcons (intern ("define-coding-system-internal"),
8371 make_number (nargs)));
8372 }
8373
8374 /* Fixme: should this record the alias relationships for
8375 diagnostics? Should it update coding-system-list? */
8376 DEFUN ("define-coding-system-alias", Fdefine_coding_system_alias,
8377 Sdefine_coding_system_alias, 2, 2, 0,
8378 doc: /* Define ALIAS as an alias for CODING-SYSTEM. */)
8379 (alias, coding_system)
8380 Lisp_Object alias, coding_system;
8381 {
8382 Lisp_Object spec, aliases, eol_type;
8383
8384 CHECK_SYMBOL (alias);
8385 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8386 aliases = AREF (spec, 1);
8387 while (!NILP (XCDR (aliases)))
8388 aliases = XCDR (aliases);
8389 XCDR (aliases) = Fcons (alias, Qnil);
8390
8391 eol_type = AREF (spec, 2);
8392 if (VECTORP (eol_type))
8393 {
8394 Lisp_Object subsidiaries;
8395 int i;
8396
8397 subsidiaries = make_subsidiaries (alias);
8398 for (i = 0; i < 3; i++)
8399 Fdefine_coding_system_alias (AREF (subsidiaries, i),
8400 AREF (eol_type, i));
8401
8402 ASET (spec, 2, subsidiaries);
8403 }
8404
8405 Fputhash (alias, spec, Vcoding_system_hash_table);
8406 Vcoding_system_alist = Fcons (Fcons (Fsymbol_name (alias), Qnil),
8407 Vcoding_system_alist);
8408
8409 return Qnil;
8410 }
8411
8412 DEFUN ("coding-system-base", Fcoding_system_base, Scoding_system_base,
8413 1, 1, 0,
8414 doc: /* Return the base of CODING-SYSTEM.
8415 Any alias or subsidiary coding system is not a base coding system. */)
8416 (coding_system)
8417 Lisp_Object coding_system;
8418 {
8419 Lisp_Object spec, attrs;
8420
8421 if (NILP (coding_system))
8422 return (Qno_conversion);
8423 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8424 attrs = AREF (spec, 0);
8425 return CODING_ATTR_BASE_NAME (attrs);
8426 }
8427
8428 DEFUN ("coding-system-plist", Fcoding_system_plist, Scoding_system_plist,
8429 1, 1, 0,
8430 doc: "Return the property list of CODING-SYSTEM.")
8431 (coding_system)
8432 Lisp_Object coding_system;
8433 {
8434 Lisp_Object spec, attrs;
8435
8436 if (NILP (coding_system))
8437 coding_system = Qno_conversion;
8438 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8439 attrs = AREF (spec, 0);
8440 return CODING_ATTR_PLIST (attrs);
8441 }
8442
8443
8444 DEFUN ("coding-system-aliases", Fcoding_system_aliases, Scoding_system_aliases,
8445 1, 1, 0,
8446 doc: /* Return the list of aliases of CODING-SYSTEM. */)
8447 (coding_system)
8448 Lisp_Object coding_system;
8449 {
8450 Lisp_Object spec;
8451
8452 if (NILP (coding_system))
8453 coding_system = Qno_conversion;
8454 CHECK_CODING_SYSTEM_GET_SPEC (coding_system, spec);
8455 return AREF (spec, 1);
8456 }
8457
8458 DEFUN ("coding-system-eol-type", Fcoding_system_eol_type,
8459 Scoding_system_eol_type, 1, 1, 0,
8460 doc: /* Return eol-type of CODING-SYSTEM.
8461 An eol-type is integer 0, 1, 2, or a vector of coding systems.
8462
8463 Integer values 0, 1, and 2 indicate a format of end-of-line; LF, CRLF,
8464 and CR respectively.
8465
8466 A vector value indicates that a format of end-of-line should be
8467 detected automatically. Nth element of the vector is the subsidiary
8468 coding system whose eol-type is N. */)
8469 (coding_system)
8470 Lisp_Object coding_system;
8471 {
8472 Lisp_Object spec, eol_type;
8473 int n;
8474
8475 if (NILP (coding_system))
8476 coding_system = Qno_conversion;
8477 if (! CODING_SYSTEM_P (coding_system))
8478 return Qnil;
8479 spec = CODING_SYSTEM_SPEC (coding_system);
8480 eol_type = AREF (spec, 2);
8481 if (VECTORP (eol_type))
8482 return Fcopy_sequence (eol_type);
8483 n = EQ (eol_type, Qunix) ? 0 : EQ (eol_type, Qdos) ? 1 : 2;
8484 return make_number (n);
8485 }
8486
8487 #endif /* emacs */
8488
8489 \f
8490 /*** 9. Post-amble ***/
8491
8492 void
8493 init_coding_once ()
8494 {
8495 int i;
8496
8497 for (i = 0; i < coding_category_max; i++)
8498 {
8499 coding_categories[i].id = -1;
8500 coding_priorities[i] = i;
8501 }
8502
8503 /* ISO2022 specific initialize routine. */
8504 for (i = 0; i < 0x20; i++)
8505 iso_code_class[i] = ISO_control_0;
8506 for (i = 0x21; i < 0x7F; i++)
8507 iso_code_class[i] = ISO_graphic_plane_0;
8508 for (i = 0x80; i < 0xA0; i++)
8509 iso_code_class[i] = ISO_control_1;
8510 for (i = 0xA1; i < 0xFF; i++)
8511 iso_code_class[i] = ISO_graphic_plane_1;
8512 iso_code_class[0x20] = iso_code_class[0x7F] = ISO_0x20_or_0x7F;
8513 iso_code_class[0xA0] = iso_code_class[0xFF] = ISO_0xA0_or_0xFF;
8514 iso_code_class[ISO_CODE_CR] = ISO_carriage_return;
8515 iso_code_class[ISO_CODE_SO] = ISO_shift_out;
8516 iso_code_class[ISO_CODE_SI] = ISO_shift_in;
8517 iso_code_class[ISO_CODE_SS2_7] = ISO_single_shift_2_7;
8518 iso_code_class[ISO_CODE_ESC] = ISO_escape;
8519 iso_code_class[ISO_CODE_SS2] = ISO_single_shift_2;
8520 iso_code_class[ISO_CODE_SS3] = ISO_single_shift_3;
8521 iso_code_class[ISO_CODE_CSI] = ISO_control_sequence_introducer;
8522
8523 inhibit_pre_post_conversion = 0;
8524
8525 for (i = 0; i < 256; i++)
8526 {
8527 emacs_mule_bytes[i] = 1;
8528 }
8529 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_11] = 3;
8530 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_12] = 3;
8531 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_21] = 4;
8532 emacs_mule_bytes[EMACS_MULE_LEADING_CODE_PRIVATE_22] = 4;
8533 }
8534
8535 #ifdef emacs
8536
8537 void
8538 syms_of_coding ()
8539 {
8540 staticpro (&Vcoding_system_hash_table);
8541 Vcoding_system_hash_table = Fmakehash (Qeq);
8542
8543 staticpro (&Vsjis_coding_system);
8544 Vsjis_coding_system = Qnil;
8545
8546 staticpro (&Vbig5_coding_system);
8547 Vbig5_coding_system = Qnil;
8548
8549 staticpro (&Vcode_conversion_work_buf_list);
8550 Vcode_conversion_work_buf_list = Qnil;
8551
8552 staticpro (&Vcode_conversion_reused_work_buf);
8553 Vcode_conversion_reused_work_buf = Qnil;
8554
8555 DEFSYM (Qcharset, "charset");
8556 DEFSYM (Qtarget_idx, "target-idx");
8557 DEFSYM (Qcoding_system_history, "coding-system-history");
8558 Fset (Qcoding_system_history, Qnil);
8559
8560 /* Target FILENAME is the first argument. */
8561 Fput (Qinsert_file_contents, Qtarget_idx, make_number (0));
8562 /* Target FILENAME is the third argument. */
8563 Fput (Qwrite_region, Qtarget_idx, make_number (2));
8564
8565 DEFSYM (Qcall_process, "call-process");
8566 /* Target PROGRAM is the first argument. */
8567 Fput (Qcall_process, Qtarget_idx, make_number (0));
8568
8569 DEFSYM (Qcall_process_region, "call-process-region");
8570 /* Target PROGRAM is the third argument. */
8571 Fput (Qcall_process_region, Qtarget_idx, make_number (2));
8572
8573 DEFSYM (Qstart_process, "start-process");
8574 /* Target PROGRAM is the third argument. */
8575 Fput (Qstart_process, Qtarget_idx, make_number (2));
8576
8577 DEFSYM (Qopen_network_stream, "open-network-stream");
8578 /* Target SERVICE is the fourth argument. */
8579 Fput (Qopen_network_stream, Qtarget_idx, make_number (3));
8580
8581 DEFSYM (Qcoding_system, "coding-system");
8582 DEFSYM (Qcoding_aliases, "coding-aliases");
8583
8584 DEFSYM (Qeol_type, "eol-type");
8585 DEFSYM (Qunix, "unix");
8586 DEFSYM (Qdos, "dos");
8587
8588 DEFSYM (Qbuffer_file_coding_system, "buffer-file-coding-system");
8589 DEFSYM (Qpost_read_conversion, "post-read-conversion");
8590 DEFSYM (Qpre_write_conversion, "pre-write-conversion");
8591 DEFSYM (Qdefault_char, "default-char");
8592 DEFSYM (Qundecided, "undecided");
8593 DEFSYM (Qno_conversion, "no-conversion");
8594 DEFSYM (Qraw_text, "raw-text");
8595
8596 DEFSYM (Qiso_2022, "iso-2022");
8597
8598 DEFSYM (Qutf_8, "utf-8");
8599
8600 DEFSYM (Qutf_16, "utf-16");
8601 DEFSYM (Qbig, "big");
8602 DEFSYM (Qlittle, "little");
8603
8604 DEFSYM (Qshift_jis, "shift-jis");
8605 DEFSYM (Qbig5, "big5");
8606
8607 DEFSYM (Qcoding_system_p, "coding-system-p");
8608
8609 DEFSYM (Qcoding_system_error, "coding-system-error");
8610 Fput (Qcoding_system_error, Qerror_conditions,
8611 Fcons (Qcoding_system_error, Fcons (Qerror, Qnil)));
8612 Fput (Qcoding_system_error, Qerror_message,
8613 build_string ("Invalid coding system"));
8614
8615 /* Intern this now in case it isn't already done.
8616 Setting this variable twice is harmless.
8617 But don't staticpro it here--that is done in alloc.c. */
8618 Qchar_table_extra_slots = intern ("char-table-extra-slots");
8619
8620 DEFSYM (Qtranslation_table, "translation-table");
8621 Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1));
8622 DEFSYM (Qtranslation_table_id, "translation-table-id");
8623 DEFSYM (Qtranslation_table_for_decode, "translation-table-for-decode");
8624 DEFSYM (Qtranslation_table_for_encode, "translation-table-for-encode");
8625
8626 DEFSYM (Qvalid_codes, "valid-codes");
8627
8628 DEFSYM (Qemacs_mule, "emacs-mule");
8629
8630 DEFSYM (QCcategory, ":category");
8631
8632 Vcoding_category_table
8633 = Fmake_vector (make_number (coding_category_max), Qnil);
8634 staticpro (&Vcoding_category_table);
8635 /* Followings are target of code detection. */
8636 ASET (Vcoding_category_table, coding_category_iso_7,
8637 intern ("coding-category-iso-7"));
8638 ASET (Vcoding_category_table, coding_category_iso_7_tight,
8639 intern ("coding-category-iso-7-tight"));
8640 ASET (Vcoding_category_table, coding_category_iso_8_1,
8641 intern ("coding-category-iso-8-1"));
8642 ASET (Vcoding_category_table, coding_category_iso_8_2,
8643 intern ("coding-category-iso-8-2"));
8644 ASET (Vcoding_category_table, coding_category_iso_7_else,
8645 intern ("coding-category-iso-7-else"));
8646 ASET (Vcoding_category_table, coding_category_iso_8_else,
8647 intern ("coding-category-iso-8-else"));
8648 ASET (Vcoding_category_table, coding_category_utf_8,
8649 intern ("coding-category-utf-8"));
8650 ASET (Vcoding_category_table, coding_category_utf_16_be,
8651 intern ("coding-category-utf-16-be"));
8652 ASET (Vcoding_category_table, coding_category_utf_16_auto,
8653 intern ("coding-category-utf-16-auto"));
8654 ASET (Vcoding_category_table, coding_category_utf_16_le,
8655 intern ("coding-category-utf-16-le"));
8656 ASET (Vcoding_category_table, coding_category_utf_16_be_nosig,
8657 intern ("coding-category-utf-16-be-nosig"));
8658 ASET (Vcoding_category_table, coding_category_utf_16_le_nosig,
8659 intern ("coding-category-utf-16-le-nosig"));
8660 ASET (Vcoding_category_table, coding_category_charset,
8661 intern ("coding-category-charset"));
8662 ASET (Vcoding_category_table, coding_category_sjis,
8663 intern ("coding-category-sjis"));
8664 ASET (Vcoding_category_table, coding_category_big5,
8665 intern ("coding-category-big5"));
8666 ASET (Vcoding_category_table, coding_category_ccl,
8667 intern ("coding-category-ccl"));
8668 ASET (Vcoding_category_table, coding_category_emacs_mule,
8669 intern ("coding-category-emacs-mule"));
8670 /* Followings are NOT target of code detection. */
8671 ASET (Vcoding_category_table, coding_category_raw_text,
8672 intern ("coding-category-raw-text"));
8673 ASET (Vcoding_category_table, coding_category_undecided,
8674 intern ("coding-category-undecided"));
8675
8676 defsubr (&Scoding_system_p);
8677 defsubr (&Sread_coding_system);
8678 defsubr (&Sread_non_nil_coding_system);
8679 defsubr (&Scheck_coding_system);
8680 defsubr (&Sdetect_coding_region);
8681 defsubr (&Sdetect_coding_string);
8682 defsubr (&Sfind_coding_systems_region_internal);
8683 defsubr (&Scheck_coding_systems_region);
8684 defsubr (&Sdecode_coding_region);
8685 defsubr (&Sencode_coding_region);
8686 defsubr (&Sdecode_coding_string);
8687 defsubr (&Sencode_coding_string);
8688 defsubr (&Sdecode_sjis_char);
8689 defsubr (&Sencode_sjis_char);
8690 defsubr (&Sdecode_big5_char);
8691 defsubr (&Sencode_big5_char);
8692 defsubr (&Sset_terminal_coding_system_internal);
8693 defsubr (&Sset_safe_terminal_coding_system_internal);
8694 defsubr (&Sterminal_coding_system);
8695 defsubr (&Sset_keyboard_coding_system_internal);
8696 defsubr (&Skeyboard_coding_system);
8697 defsubr (&Sfind_operation_coding_system);
8698 defsubr (&Sset_coding_system_priority);
8699 defsubr (&Sdefine_coding_system_internal);
8700 defsubr (&Sdefine_coding_system_alias);
8701 defsubr (&Scoding_system_base);
8702 defsubr (&Scoding_system_plist);
8703 defsubr (&Scoding_system_aliases);
8704 defsubr (&Scoding_system_eol_type);
8705 defsubr (&Scoding_system_priority_list);
8706
8707 DEFVAR_LISP ("coding-system-list", &Vcoding_system_list,
8708 doc: /* List of coding systems.
8709
8710 Do not alter the value of this variable manually. This variable should be
8711 updated by the functions `define-coding-system' and
8712 `define-coding-system-alias'. */);
8713 Vcoding_system_list = Qnil;
8714
8715 DEFVAR_LISP ("coding-system-alist", &Vcoding_system_alist,
8716 doc: /* Alist of coding system names.
8717 Each element is one element list of coding system name.
8718 This variable is given to `completing-read' as TABLE argument.
8719
8720 Do not alter the value of this variable manually. This variable should be
8721 updated by the functions `make-coding-system' and
8722 `define-coding-system-alias'. */);
8723 Vcoding_system_alist = Qnil;
8724
8725 DEFVAR_LISP ("coding-category-list", &Vcoding_category_list,
8726 doc: /* List of coding-categories (symbols) ordered by priority.
8727
8728 On detecting a coding system, Emacs tries code detection algorithms
8729 associated with each coding-category one by one in this order. When
8730 one algorithm agrees with a byte sequence of source text, the coding
8731 system bound to the corresponding coding-category is selected. */);
8732 {
8733 int i;
8734
8735 Vcoding_category_list = Qnil;
8736 for (i = coding_category_max - 1; i >= 0; i--)
8737 Vcoding_category_list
8738 = Fcons (XVECTOR (Vcoding_category_table)->contents[i],
8739 Vcoding_category_list);
8740 }
8741
8742 DEFVAR_LISP ("coding-system-for-read", &Vcoding_system_for_read,
8743 doc: /* Specify the coding system for read operations.
8744 It is useful to bind this variable with `let', but do not set it globally.
8745 If the value is a coding system, it is used for decoding on read operation.
8746 If not, an appropriate element is used from one of the coding system alists:
8747 There are three such tables, `file-coding-system-alist',
8748 `process-coding-system-alist', and `network-coding-system-alist'. */);
8749 Vcoding_system_for_read = Qnil;
8750
8751 DEFVAR_LISP ("coding-system-for-write", &Vcoding_system_for_write,
8752 doc: /* Specify the coding system for write operations.
8753 Programs bind this variable with `let', but you should not set it globally.
8754 If the value is a coding system, it is used for encoding of output,
8755 when writing it to a file and when sending it to a file or subprocess.
8756
8757 If this does not specify a coding system, an appropriate element
8758 is used from one of the coding system alists:
8759 There are three such tables, `file-coding-system-alist',
8760 `process-coding-system-alist', and `network-coding-system-alist'.
8761 For output to files, if the above procedure does not specify a coding system,
8762 the value of `buffer-file-coding-system' is used. */);
8763 Vcoding_system_for_write = Qnil;
8764
8765 DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used,
8766 doc: /*
8767 Coding system used in the latest file or process I/O. */);
8768 Vlast_coding_system_used = Qnil;
8769
8770 DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion,
8771 doc: /*
8772 *Non-nil means always inhibit code conversion of end-of-line format.
8773 See info node `Coding Systems' and info node `Text and Binary' concerning
8774 such conversion. */);
8775 inhibit_eol_conversion = 0;
8776
8777 DEFVAR_BOOL ("inherit-process-coding-system", &inherit_process_coding_system,
8778 doc: /*
8779 Non-nil means process buffer inherits coding system of process output.
8780 Bind it to t if the process output is to be treated as if it were a file
8781 read from some filesystem. */);
8782 inherit_process_coding_system = 0;
8783
8784 DEFVAR_LISP ("file-coding-system-alist", &Vfile_coding_system_alist,
8785 doc: /*
8786 Alist to decide a coding system to use for a file I/O operation.
8787 The format is ((PATTERN . VAL) ...),
8788 where PATTERN is a regular expression matching a file name,
8789 VAL is a coding system, a cons of coding systems, or a function symbol.
8790 If VAL is a coding system, it is used for both decoding and encoding
8791 the file contents.
8792 If VAL is a cons of coding systems, the car part is used for decoding,
8793 and the cdr part is used for encoding.
8794 If VAL is a function symbol, the function must return a coding system
8795 or a cons of coding systems which are used as above. The function gets
8796 the arguments with which `find-operation-coding-systems' was called.
8797
8798 See also the function `find-operation-coding-system'
8799 and the variable `auto-coding-alist'. */);
8800 Vfile_coding_system_alist = Qnil;
8801
8802 DEFVAR_LISP ("process-coding-system-alist", &Vprocess_coding_system_alist,
8803 doc: /*
8804 Alist to decide a coding system to use for a process I/O operation.
8805 The format is ((PATTERN . VAL) ...),
8806 where PATTERN is a regular expression matching a program name,
8807 VAL is a coding system, a cons of coding systems, or a function symbol.
8808 If VAL is a coding system, it is used for both decoding what received
8809 from the program and encoding what sent to the program.
8810 If VAL is a cons of coding systems, the car part is used for decoding,
8811 and the cdr part is used for encoding.
8812 If VAL is a function symbol, the function must return a coding system
8813 or a cons of coding systems which are used as above.
8814
8815 See also the function `find-operation-coding-system'. */);
8816 Vprocess_coding_system_alist = Qnil;
8817
8818 DEFVAR_LISP ("network-coding-system-alist", &Vnetwork_coding_system_alist,
8819 doc: /*
8820 Alist to decide a coding system to use for a network I/O operation.
8821 The format is ((PATTERN . VAL) ...),
8822 where PATTERN is a regular expression matching a network service name
8823 or is a port number to connect to,
8824 VAL is a coding system, a cons of coding systems, or a function symbol.
8825 If VAL is a coding system, it is used for both decoding what received
8826 from the network stream and encoding what sent to the network stream.
8827 If VAL is a cons of coding systems, the car part is used for decoding,
8828 and the cdr part is used for encoding.
8829 If VAL is a function symbol, the function must return a coding system
8830 or a cons of coding systems which are used as above.
8831
8832 See also the function `find-operation-coding-system'. */);
8833 Vnetwork_coding_system_alist = Qnil;
8834
8835 DEFVAR_LISP ("locale-coding-system", &Vlocale_coding_system,
8836 doc: /* Coding system to use with system messages.
8837 Also used for decoding keyboard input on X Window system. */);
8838 Vlocale_coding_system = Qnil;
8839
8840 /* The eol mnemonics are reset in startup.el system-dependently. */
8841 DEFVAR_LISP ("eol-mnemonic-unix", &eol_mnemonic_unix,
8842 doc: /*
8843 *String displayed in mode line for UNIX-like (LF) end-of-line format. */);
8844 eol_mnemonic_unix = build_string (":");
8845
8846 DEFVAR_LISP ("eol-mnemonic-dos", &eol_mnemonic_dos,
8847 doc: /*
8848 *String displayed in mode line for DOS-like (CRLF) end-of-line format. */);
8849 eol_mnemonic_dos = build_string ("\\");
8850
8851 DEFVAR_LISP ("eol-mnemonic-mac", &eol_mnemonic_mac,
8852 doc: /*
8853 *String displayed in mode line for MAC-like (CR) end-of-line format. */);
8854 eol_mnemonic_mac = build_string ("/");
8855
8856 DEFVAR_LISP ("eol-mnemonic-undecided", &eol_mnemonic_undecided,
8857 doc: /*
8858 *String displayed in mode line when end-of-line format is not yet determined. */);
8859 eol_mnemonic_undecided = build_string (":");
8860
8861 DEFVAR_LISP ("enable-character-translation", &Venable_character_translation,
8862 doc: /*
8863 *Non-nil enables character translation while encoding and decoding. */);
8864 Venable_character_translation = Qt;
8865
8866 DEFVAR_LISP ("standard-translation-table-for-decode",
8867 &Vstandard_translation_table_for_decode,
8868 doc: /* Table for translating characters while decoding. */);
8869 Vstandard_translation_table_for_decode = Qnil;
8870
8871 DEFVAR_LISP ("standard-translation-table-for-encode",
8872 &Vstandard_translation_table_for_encode,
8873 doc: /* Table for translating characters while encoding. */);
8874 Vstandard_translation_table_for_encode = Qnil;
8875
8876 DEFVAR_LISP ("charset-revision-table", &Vcharset_revision_table,
8877 doc: /* Alist of charsets vs revision numbers.
8878 While encoding, if a charset (car part of an element) is found,
8879 designate it with the escape sequence identifying revision (cdr part
8880 of the element). */);
8881 Vcharset_revision_table = Qnil;
8882
8883 DEFVAR_LISP ("default-process-coding-system",
8884 &Vdefault_process_coding_system,
8885 doc: /* Cons of coding systems used for process I/O by default.
8886 The car part is used for decoding a process output,
8887 the cdr part is used for encoding a text to be sent to a process. */);
8888 Vdefault_process_coding_system = Qnil;
8889
8890 DEFVAR_LISP ("latin-extra-code-table", &Vlatin_extra_code_table,
8891 doc: /*
8892 Table of extra Latin codes in the range 128..159 (inclusive).
8893 This is a vector of length 256.
8894 If Nth element is non-nil, the existence of code N in a file
8895 \(or output of subprocess) doesn't prevent it to be detected as
8896 a coding system of ISO 2022 variant which has a flag
8897 `accept-latin-extra-code' t (e.g. iso-latin-1) on reading a file
8898 or reading output of a subprocess.
8899 Only 128th through 159th elements has a meaning. */);
8900 Vlatin_extra_code_table = Fmake_vector (make_number (256), Qnil);
8901
8902 DEFVAR_LISP ("select-safe-coding-system-function",
8903 &Vselect_safe_coding_system_function,
8904 doc: /*
8905 Function to call to select safe coding system for encoding a text.
8906
8907 If set, this function is called to force a user to select a proper
8908 coding system which can encode the text in the case that a default
8909 coding system used in each operation can't encode the text.
8910
8911 The default value is `select-safe-coding-system' (which see). */);
8912 Vselect_safe_coding_system_function = Qnil;
8913
8914 DEFVAR_BOOL ("inhibit-iso-escape-detection",
8915 &inhibit_iso_escape_detection,
8916 doc: /*
8917 If non-nil, Emacs ignores ISO2022's escape sequence on code detection.
8918
8919 By default, on reading a file, Emacs tries to detect how the text is
8920 encoded. This code detection is sensitive to escape sequences. If
8921 the sequence is valid as ISO2022, the code is determined as one of
8922 the ISO2022 encodings, and the file is decoded by the corresponding
8923 coding system (e.g. `iso-2022-7bit').
8924
8925 However, there may be a case that you want to read escape sequences in
8926 a file as is. In such a case, you can set this variable to non-nil.
8927 Then, as the code detection ignores any escape sequences, no file is
8928 detected as encoded in some ISO2022 encoding. The result is that all
8929 escape sequences become visible in a buffer.
8930
8931 The default value is nil, and it is strongly recommended not to change
8932 it. That is because many Emacs Lisp source files that contain
8933 non-ASCII characters are encoded by the coding system `iso-2022-7bit'
8934 in Emacs's distribution, and they won't be decoded correctly on
8935 reading if you suppress escape sequence detection.
8936
8937 The other way to read escape sequences in a file without decoding is
8938 to explicitly specify some coding system that doesn't use ISO2022's
8939 escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */);
8940 inhibit_iso_escape_detection = 0;
8941
8942 {
8943 Lisp_Object args[coding_arg_max];
8944 Lisp_Object plist[14];
8945 int i;
8946
8947 for (i = 0; i < coding_arg_max; i++)
8948 args[i] = Qnil;
8949
8950 plist[0] = intern (":name");
8951 plist[1] = args[coding_arg_name] = Qno_conversion;
8952 plist[2] = intern (":mnemonic");
8953 plist[3] = args[coding_arg_mnemonic] = make_number ('=');
8954 plist[4] = intern (":coding-type");
8955 plist[5] = args[coding_arg_coding_type] = Qraw_text;
8956 plist[6] = intern (":ascii-compatible-p");
8957 plist[7] = args[coding_arg_ascii_compatible_p] = Qt;
8958 plist[8] = intern (":default-char");
8959 plist[9] = args[coding_arg_default_char] = make_number (0);
8960 plist[10] = intern (":docstring");
8961 plist[11] = build_string ("Do no conversion.\n\
8962 \n\
8963 When you visit a file with this coding, the file is read into a\n\
8964 unibyte buffer as is, thus each byte of a file is treated as a\n\
8965 character.");
8966 plist[12] = intern (":eol-type");
8967 plist[13] = args[coding_arg_eol_type] = Qunix;
8968 args[coding_arg_plist] = Flist (14, plist);
8969 Fdefine_coding_system_internal (coding_arg_max, args);
8970 }
8971
8972 setup_coding_system (Qno_conversion, &keyboard_coding);
8973 setup_coding_system (Qno_conversion, &terminal_coding);
8974 setup_coding_system (Qno_conversion, &safe_terminal_coding);
8975
8976 {
8977 int i;
8978
8979 for (i = 0; i < coding_category_max; i++)
8980 Fset (AREF (Vcoding_category_table, i), Qno_conversion);
8981 }
8982 }
8983
8984 char *
8985 emacs_strerror (error_number)
8986 int error_number;
8987 {
8988 char *str;
8989
8990 synchronize_system_messages_locale ();
8991 str = strerror (error_number);
8992
8993 if (! NILP (Vlocale_coding_system))
8994 {
8995 Lisp_Object dec = code_convert_string_norecord (build_string (str),
8996 Vlocale_coding_system,
8997 0);
8998 str = (char *) XSTRING (dec)->data;
8999 }
9000
9001 return str;
9002 }
9003
9004 #endif /* emacs */