X-Git-Url: https://git.hcoop.net/bpt/emacs.git/blobdiff_plain/7d0393cf12f2e50ee50e1a1fd73a60e9ef1f56ac..ab5796a9f97180707734a81320e3eb81937281fe:/src/coding.c diff --git a/src/coding.c b/src/coding.c index 3fa383ad7b..6d36cc397c 100644 --- a/src/coding.c +++ b/src/coding.c @@ -1,7 +1,7 @@ /* Coding system handler (conversion, detection, and etc). - Copyright (C) 1995, 1997, 1998, 2002 Electrotechnical Laboratory, JAPAN. + Copyright (C) 1995,97,1998,2002,2003 Electrotechnical Laboratory, JAPAN. Licensed to the Free Software Foundation. - Copyright (C) 2001 Free Software Foundation, Inc. + Copyright (C) 2001,2002,2003 Free Software Foundation, Inc. This file is part of GNU Emacs. @@ -345,6 +345,7 @@ encode_coding_XXX (coding, source, destination, src_bytes, dst_bytes) #include "ccl.h" #include "coding.h" #include "window.h" +#include "intervals.h" #else /* not emacs */ @@ -367,6 +368,8 @@ Lisp_Object Qtarget_idx; Lisp_Object Vselect_safe_coding_system_function; +int coding_system_require_warning; + /* Mnemonic string for each format of end-of-line. */ Lisp_Object eol_mnemonic_unix, eol_mnemonic_dos, eol_mnemonic_mac; /* Mnemonic string to indicate format of end-of-line is not yet @@ -379,6 +382,16 @@ int system_eol_type; #ifdef emacs +/* Information about which coding system is safe for which chars. + The value has the form (GENERIC-LIST . NON-GENERIC-ALIST). + + GENERIC-LIST is a list of generic coding systems which can encode + any characters. + + NON-GENERIC-ALIST is an alist of non generic coding systems vs the + corresponding char table that contains safe chars. */ +Lisp_Object Vcoding_system_safe_chars; + Lisp_Object Vcoding_system_list, Vcoding_system_alist; Lisp_Object Qcoding_system_p, Qcoding_system_error; @@ -387,6 +400,8 @@ Lisp_Object Qcoding_system_p, Qcoding_system_error; end-of-line format. */ Lisp_Object Qemacs_mule, Qraw_text; +Lisp_Object Qutf_8; + /* Coding-systems are handed between Emacs Lisp programs and C internal routines by the following three variables. */ /* Coding-system for reading files and receiving data from process. */ @@ -485,26 +500,27 @@ Lisp_Object Vcharset_revision_alist; /* Default coding systems used for process I/O. */ Lisp_Object Vdefault_process_coding_system; +/* Char table for translating Quail and self-inserting input. */ +Lisp_Object Vtranslation_table_for_input; + /* Global flag to tell that we can't call post-read-conversion and pre-write-conversion functions. Usually the value is zero, but it is set to 1 temporarily while such functions are running. This is to avoid infinite recursive call. */ static int inhibit_pre_post_conversion; -/* Char-table containing safe coding systems of each character. */ -Lisp_Object Vchar_coding_system_table; Lisp_Object Qchar_coding_system; -/* Return `safe-chars' property of coding system CODING. Don't check - validity of CODING. */ +/* Return `safe-chars' property of CODING_SYSTEM (symbol). Don't check + its validity. */ Lisp_Object -coding_safe_chars (coding) - struct coding_system *coding; +coding_safe_chars (coding_system) + Lisp_Object coding_system; { Lisp_Object coding_spec, plist, safe_chars; - coding_spec = Fget (coding->symbol, Qcoding_system); + coding_spec = Fget (coding_system, Qcoding_system); plist = XVECTOR (coding_spec)->contents[3]; safe_chars = Fplist_get (XVECTOR (coding_spec)->contents[3], Qsafe_chars); return (CHAR_TABLE_P (safe_chars) ? safe_chars : Qt); @@ -673,8 +689,16 @@ detect_coding_emacs_mule (src, src_end, multibytep) /* Record one COMPONENT (alternate character or composition rule). */ -#define CODING_ADD_COMPOSITION_COMPONENT(coding, component) \ - (coding->cmp_data->data[coding->cmp_data->used++] = component) +#define CODING_ADD_COMPOSITION_COMPONENT(coding, component) \ + do { \ + coding->cmp_data->data[coding->cmp_data->used++] = component; \ + if (coding->cmp_data->used - coding->cmp_data_start \ + == COMPOSITION_DATA_MAX_BUNCH_LENGTH) \ + { \ + CODING_ADD_COMPOSITION_END (coding, coding->produced_char); \ + coding->composing = COMPOSITION_NO; \ + } \ + } while (0) /* Get one byte from a data pointed by SRC and increment SRC. If SRC @@ -691,7 +715,7 @@ detect_coding_emacs_mule (src, src_end, multibytep) #define DECODE_EMACS_MULE_COMPOSITION_CHAR(c, p) \ do { \ int bytes; \ - \ + \ c = SAFE_ONE_MORE_BYTE (); \ if (c < 0) \ break; \ @@ -722,7 +746,10 @@ detect_coding_emacs_mule (src, src_end, multibytep) break; \ *p++ = c; \ } \ - if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes)) \ + if (UNIBYTE_STR_AS_MULTIBYTE_P (p0, p - p0, bytes) \ + || (coding->flags /* We are recovering a file. */ \ + && p0[0] == LEADING_CODE_8_BIT_CONTROL \ + && ! CHAR_HEAD_P (p0[1]))) \ c = STRING_CHAR (p0, bytes); \ else \ c = -1; \ @@ -826,7 +853,10 @@ decode_composition_emacs_mule (coding, src, src_end, else { int bytes; - if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)) + if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes) + || (coding->flags /* We are recovering a file. */ + && src[0] == LEADING_CODE_8_BIT_CONTROL + && ! CHAR_HEAD_P (src[1]))) c = STRING_CHAR (src, bytes); else c = *src, bytes = 1; @@ -942,11 +972,6 @@ decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes) ONE_MORE_BYTE (c); if (c != '\n') { - if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL) - { - coding->result = CODING_FINISH_INCONSISTENT_EOL; - goto label_end_of_loop; - } src--; c = '\r'; } @@ -985,7 +1010,10 @@ decode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes) p = tmp; src++; } - else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes)) + else if (UNIBYTE_STR_AS_MULTIBYTE_P (src, src_end - src, bytes) + || (coding->flags /* We are recovering a file. */ + && src[0] == LEADING_CODE_8_BIT_CONTROL + && ! CHAR_HEAD_P (src[1]))) { p = src; src += bytes; @@ -1116,7 +1144,22 @@ encode_coding_emacs_mule (coding, source, destination, src_bytes, dst_bytes) EMIT_ONE_BYTE ('\r'); } else if (SINGLE_BYTE_CHAR_P (c)) - EMIT_ONE_BYTE (c); + { + if (coding->flags && ! ASCII_BYTE_P (c)) + { + /* As we are auto saving, retain the multibyte form for + 8-bit chars. */ + unsigned char buf[MAX_MULTIBYTE_LENGTH]; + int bytes = CHAR_STRING (c, buf); + + if (bytes == 1) + EMIT_ONE_BYTE (buf[0]); + else + EMIT_TWO_BYTES (buf[0], buf[1]); + } + else + EMIT_ONE_BYTE (c); + } else EMIT_BYTES (src_base, src); coding->consumed_char++; @@ -1310,7 +1353,7 @@ enum iso_code_class_type iso_code_class[256]; #define CHARSET_OK(idx, charset, c) \ (coding_system_table[idx] \ && (charset == CHARSET_ASCII \ - || (safe_chars = coding_safe_chars (coding_system_table[idx]), \ + || (safe_chars = coding_safe_chars (coding_system_table[idx]->symbol), \ CODING_SAFE_CHAR_P (safe_chars, c))) \ && (CODING_SPEC_ISO_REQUESTED_DESIGNATION (coding_system_table[idx], \ charset) \ @@ -1319,6 +1362,9 @@ enum iso_code_class_type iso_code_class[256]; #define SHIFT_OUT_OK(idx) \ (CODING_SPEC_ISO_INITIAL_DESIGNATION (coding_system_table[idx], 1) >= 0) +#define COMPOSITION_OK(idx) \ + (coding_system_table[idx]->composing != COMPOSITION_DISABLED) + /* See the above "GENERAL NOTES on `detect_coding_XXX ()' functions". Check if a text is encoded in ISO2022. If it is, return an integer in which appropriate flag bits any of: @@ -1396,7 +1442,30 @@ detect_coding_iso2022 (src, src_end, multibytep) else if (c >= '0' && c <= '4') { /* ESC for start/end composition. */ - mask_found |= CODING_CATEGORY_MASK_ISO; + if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7)) + mask_found |= CODING_CATEGORY_MASK_ISO_7; + else + mask &= ~CODING_CATEGORY_MASK_ISO_7; + if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_TIGHT)) + mask_found |= CODING_CATEGORY_MASK_ISO_7_TIGHT; + else + mask &= ~CODING_CATEGORY_MASK_ISO_7_TIGHT; + if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_1)) + mask_found |= CODING_CATEGORY_MASK_ISO_8_1; + else + mask &= ~CODING_CATEGORY_MASK_ISO_8_1; + if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_2)) + mask_found |= CODING_CATEGORY_MASK_ISO_8_2; + else + mask &= ~CODING_CATEGORY_MASK_ISO_8_2; + if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_7_ELSE)) + mask_found |= CODING_CATEGORY_MASK_ISO_7_ELSE; + else + mask &= ~CODING_CATEGORY_MASK_ISO_7_ELSE; + if (COMPOSITION_OK (CODING_CATEGORY_IDX_ISO_8_ELSE)) + mask_found |= CODING_CATEGORY_MASK_ISO_8_ELSE; + else + mask &= ~CODING_CATEGORY_MASK_ISO_8_ELSE; break; } else @@ -1739,7 +1808,7 @@ decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) Lisp_Object translation_table; Lisp_Object safe_chars; - safe_chars = coding_safe_chars (coding); + safe_chars = coding_safe_chars (coding->symbol); if (NILP (Venable_character_translation)) translation_table = Qnil; @@ -1754,7 +1823,7 @@ decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) while (1) { - int c1, c2; + int c1, c2 = 0; src_base = src; ONE_MORE_BYTE (c1); @@ -1830,11 +1899,6 @@ decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) ONE_MORE_BYTE (c1); if (c1 != ISO_CODE_LF) { - if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL) - { - coding->result = CODING_FINISH_INCONSISTENT_EOL; - goto label_end_of_loop; - } src--; c1 = '\r'; } @@ -1999,6 +2063,78 @@ decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) } continue; + case '%': + if (COMPOSING_P (coding)) + DECODE_COMPOSITION_END ('1'); + ONE_MORE_BYTE (c1); + if (c1 == '/') + { + /* CTEXT extended segment: + ESC % / [0-4] M L --ENCODING-NAME-- \002 --BYTES-- + We keep these bytes as is for the moment. + They may be decoded by post-read-conversion. */ + int dim, M, L; + int size, required; + int produced_chars; + + ONE_MORE_BYTE (dim); + ONE_MORE_BYTE (M); + ONE_MORE_BYTE (L); + size = ((M - 128) * 128) + (L - 128); + required = 8 + size * 2; + if (dst + required > (dst_bytes ? dst_end : src)) + goto label_end_of_loop; + *dst++ = ISO_CODE_ESC; + *dst++ = '%'; + *dst++ = '/'; + *dst++ = dim; + produced_chars = 4; + dst += CHAR_STRING (M, dst), produced_chars++; + dst += CHAR_STRING (L, dst), produced_chars++; + while (size-- > 0) + { + ONE_MORE_BYTE (c1); + dst += CHAR_STRING (c1, dst), produced_chars++; + } + coding->produced_char += produced_chars; + } + else if (c1 == 'G') + { + unsigned char *d = dst; + int produced_chars; + + /* XFree86 extension for embedding UTF-8 in CTEXT: + ESC % G --UTF-8-BYTES-- ESC % @ + We keep these bytes as is for the moment. + They may be decoded by post-read-conversion. */ + if (d + 6 > (dst_bytes ? dst_end : src)) + goto label_end_of_loop; + *d++ = ISO_CODE_ESC; + *d++ = '%'; + *d++ = 'G'; + produced_chars = 3; + while (d + 1 < (dst_bytes ? dst_end : src)) + { + ONE_MORE_BYTE (c1); + if (c1 == ISO_CODE_ESC + && src + 1 < src_end + && src[0] == '%' + && src[1] == '@') + break; + d += CHAR_STRING (c1, d), produced_chars++; + } + if (d + 3 > (dst_bytes ? dst_end : src)) + goto label_end_of_loop; + *d++ = ISO_CODE_ESC; + *d++ = '%'; + *d++ = '@'; + dst = d; + coding->produced_char += produced_chars + 3; + } + else + goto label_invalid_code; + continue; + default: if (! (coding->flags & CODING_FLAG_ISO_DESIGNATION)) goto label_invalid_code; @@ -2264,11 +2400,11 @@ decode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) /* Instead of encoding character C, produce one or two `?'s. */ -#define ENCODE_UNSAFE_CHARACTER(c) \ - do { \ - ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION); \ - if (CHARSET_WIDTH (CHAR_CHARSET (c)) > 1) \ - ENCODE_ISO_CHARACTER (CODING_INHIBIT_CHARACTER_SUBSTITUTION); \ +#define ENCODE_UNSAFE_CHARACTER(c) \ + do { \ + ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER); \ + if (CHARSET_WIDTH (CHAR_CHARSET (c)) > 1) \ + ENCODE_ISO_CHARACTER (CODING_REPLACEMENT_CHARACTER); \ } while (0) @@ -2497,7 +2633,10 @@ encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) Lisp_Object translation_table; Lisp_Object safe_chars; - safe_chars = coding_safe_chars (coding); + if (coding->flags & CODING_FLAG_ISO_SAFE) + coding->mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR; + + safe_chars = coding_safe_chars (coding->symbol); if (NILP (Venable_character_translation)) translation_table = Qnil; @@ -2564,7 +2703,7 @@ encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) } else { - if (coding->flags & CODING_FLAG_ISO_SAFE + if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR && ! CODING_SAFE_CHAR_P (safe_chars, c)) ENCODE_UNSAFE_CHARACTER (c); else @@ -2633,7 +2772,7 @@ encode_coding_iso2022 (coding, source, destination, src_bytes, dst_bytes) *dst++ = c; coding->errors++; } - else if (coding->flags & CODING_FLAG_ISO_SAFE + else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR && ! CODING_SAFE_CHAR_P (safe_chars, c)) ENCODE_UNSAFE_CHARACTER (c); else @@ -2860,7 +2999,7 @@ detect_coding_utf_16 (src, src_end, multibytep) int multibytep; { unsigned char c1, c2; - /* Dummy for TWO_MORE_BYTES. */ + /* Dummy for ONE_MORE_BYTE_CHECK_MULTIBYTE. */ struct coding_system dummy_coding; struct coding_system *coding = &dummy_coding; @@ -2911,7 +3050,7 @@ decode_coding_sjis_big5 (coding, source, destination, coding->produced_char = 0; while (1) { - int c, charset, c1, c2; + int c, charset, c1, c2 = 0; src_base = src; ONE_MORE_BYTE (c1); @@ -2928,12 +3067,6 @@ decode_coding_sjis_big5 (coding, source, destination, ONE_MORE_BYTE (c2); if (c2 == '\n') c1 = c2; - else if (coding->mode - & CODING_MODE_INHIBIT_INCONSISTENT_EOL) - { - coding->result = CODING_FINISH_INCONSISTENT_EOL; - goto label_end_of_loop; - } else /* To process C2 again, SRC is subtracted by 1. */ src--; @@ -3082,6 +3215,12 @@ encode_coding_sjis_big5 (coding, source, destination, EMIT_ONE_BYTE (c1 | 0x80); else if (charset == charset_latin_jisx0201) EMIT_ONE_BYTE (c1); + else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR) + { + EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER); + if (CHARSET_WIDTH (charset) > 1) + EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER); + } else /* There's no way other than producing the internal codes as is. */ @@ -3094,6 +3233,12 @@ encode_coding_sjis_big5 (coding, source, destination, ENCODE_BIG5 (charset, c1, c2, c1, c2); EMIT_TWO_BYTES (c1, c2); } + else if (coding->mode & CODING_MODE_INHIBIT_UNENCODABLE_CHAR) + { + EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER); + if (CHARSET_WIDTH (charset) > 1) + EMIT_ONE_BYTE (CODING_REPLACEMENT_CHARACTER); + } else /* There's no way other than producing the internal codes as is. */ @@ -3179,11 +3324,6 @@ decode_eol (coding, source, destination, src_bytes, dst_bytes) ONE_MORE_BYTE (c); if (c != '\n') { - if (coding->mode & CODING_MODE_INHIBIT_INCONSISTENT_EOL) - { - coding->result = CODING_FINISH_INCONSISTENT_EOL; - goto label_end_of_loop; - } src--; c = '\r'; } @@ -4366,7 +4506,11 @@ encoding_buffer_size (coding, src_bytes) int magnification; if (coding->type == coding_type_ccl) - magnification = coding->spec.ccl.encoder.buf_magnification; + { + magnification = coding->spec.ccl.encoder.buf_magnification; + if (coding->eol_type == CODING_EOL_CRLF) + magnification *= 2; + } else if (CODING_REQUIRE_ENCODING (coding)) magnification = 3; else @@ -4451,7 +4595,10 @@ ccl_coding_driver (coding, source, destination, src_bytes, dst_bytes, encodep) if (ccl->eol_type ==CODING_EOL_UNDECIDED) ccl->eol_type = CODING_EOL_LF; ccl->cr_consumed = coding->spec.ccl.cr_carryover; + ccl->eight_bit_control = coding->dst_multibyte; } + else + ccl->eight_bit_control = 1; ccl->multibyte = coding->src_multibyte; if (coding->spec.ccl.eight_bit_carryover[0] != 0) { @@ -5174,10 +5321,11 @@ static int shrink_conversion_region_threshhold = 1024; } while (0) static Lisp_Object -code_convert_region_unwind (dummy) - Lisp_Object dummy; +code_convert_region_unwind (arg) + Lisp_Object arg; { inhibit_pre_post_conversion = 0; + Vlast_coding_system_used = arg; return Qnil; } @@ -5295,6 +5443,9 @@ coding_restore_composition (coding, obj) int len = data[0] - 4, j; Lisp_Object args[MAX_COMPOSITION_COMPONENTS * 2 - 1]; + if (method == COMPOSITION_WITH_RULE_ALTCHARS + && len % 2 == 0) + len --; for (j = 0; j < len; j++) args[j] = make_number (data[4 + j]); components = (method == COMPOSITION_WITH_ALTCHARS @@ -5418,7 +5569,8 @@ code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace) struct buffer *prev = current_buffer; Lisp_Object new; - record_unwind_protect (code_convert_region_unwind, Qnil); + record_unwind_protect (code_convert_region_unwind, + Vlast_coding_system_used); /* We should not call any more pre-write/post-read-conversion functions while this pre-write-conversion is running. */ inhibit_pre_post_conversion = 1; @@ -5776,16 +5928,22 @@ code_convert_region (from, from_byte, to, to_byte, coding, encodep, replace) && ! encodep && ! NILP (coding->post_read_conversion)) { Lisp_Object val; + Lisp_Object saved_coding_system; if (from != PT) TEMP_SET_PT_BOTH (from, from_byte); prev_Z = Z; - record_unwind_protect (code_convert_region_unwind, Qnil); + record_unwind_protect (code_convert_region_unwind, + Vlast_coding_system_used); + saved_coding_system = Vlast_coding_system_used; + Vlast_coding_system_used = coding->symbol; /* We should not call any more pre-write/post-read-conversion functions while this post-read-conversion is running. */ inhibit_pre_post_conversion = 1; val = call1 (coding->post_read_conversion, make_number (inserted)); inhibit_pre_post_conversion = 0; + coding->symbol = Vlast_coding_system_used; + Vlast_coding_system_used = saved_coding_system; /* Discard the unwind protect. */ specpdl_ptr--; CHECK_NUMBER (val); @@ -5831,7 +5989,8 @@ run_pre_post_conversion_on_str (str, coding, encodep) Lisp_Object old_deactivate_mark; record_unwind_protect (Fset_buffer, Fcurrent_buffer ()); - record_unwind_protect (code_convert_region_unwind, Qnil); + record_unwind_protect (code_convert_region_unwind, + Vlast_coding_system_used); /* It is not crucial to specbind this. */ old_deactivate_mark = Vdeactivate_mark; GCPRO2 (str, old_deactivate_mark); @@ -5839,12 +5998,13 @@ run_pre_post_conversion_on_str (str, coding, encodep) buffer = Fget_buffer_create (build_string (" *code-converting-work*")); buf = XBUFFER (buffer); + delete_all_overlays (buf); buf->directory = current_buffer->directory; buf->read_only = Qnil; buf->filename = Qnil; buf->undo_list = Qt; - buf->overlays_before = Qnil; - buf->overlays_after = Qnil; + eassert (buf->overlays_before == NULL); + eassert (buf->overlays_after == NULL); set_buffer_internal (buf); /* We must insert the contents of STR as is without @@ -5861,8 +6021,10 @@ run_pre_post_conversion_on_str (str, coding, encodep) call2 (coding->pre_write_conversion, make_number (BEG), make_number (Z)); else { + Vlast_coding_system_used = coding->symbol; TEMP_SET_PT_BOTH (BEG, BEG_BYTE); call1 (coding->post_read_conversion, make_number (Z - BEG)); + coding->symbol = Vlast_coding_system_used; } inhibit_pre_post_conversion = 0; Vdeactivate_mark = old_deactivate_mark; @@ -5946,7 +6108,9 @@ decode_coding_string (str, coding, nocopy) shrinked_bytes = from + (SBYTES (str) - to_byte); } - if (!require_decoding) + if (!require_decoding + && !(SYMBOLP (coding->post_read_conversion) + && !NILP (Ffboundp (coding->post_read_conversion)))) { coding->consumed = SBYTES (str); coding->consumed_char = SCHARS (str); @@ -6283,8 +6447,11 @@ detect_coding_system (src, src_bytes, highest, multibytep) DEFUN ("detect-coding-region", Fdetect_coding_region, Sdetect_coding_region, 2, 3, 0, - doc: /* Detect coding system of the text in the region between START and END. -Return a list of possible coding systems ordered by priority. + doc: /* Detect how the byte sequence in the region is encoded. +Return a list of possible coding systems used on decoding a byte +sequence containing the bytes in the region between START and END when +the coding system `undecided' is specified. The list is ordered by +priority decided in the current language environment. If only ASCII characters are found, it returns a list of single element `undecided' or its subsidiary coding system according to a detected @@ -6327,8 +6494,11 @@ highest priority. */) DEFUN ("detect-coding-string", Fdetect_coding_string, Sdetect_coding_string, 1, 2, 0, - doc: /* Detect coding system of the text in STRING. -Return a list of possible coding systems ordered by priority. + doc: /* Detect how the byte sequence in STRING is encoded. +Return a list of possible coding systems used on decoding a byte +sequence containing the bytes in STRING when the coding system +`undecided' is specified. The list is ordered by priority decided in +the current language environment. If only ASCII characters are found, it returns a list of single element `undecided' or its subsidiary coding system according to a detected @@ -6351,30 +6521,10 @@ highest priority. */) STRING_MULTIBYTE (string)); } -/* Return an intersection of lists L1 and L2. */ - -static Lisp_Object -intersection (l1, l2) - Lisp_Object l1, l2; -{ - Lisp_Object val = Fcons (Qnil, Qnil), tail; - - for (tail = val; CONSP (l1); l1 = XCDR (l1)) - { - if (!NILP (Fmemq (XCAR (l1), l2))) - { - XSETCDR (tail, Fcons (XCAR (l1), Qnil)); - tail = XCDR (tail); - } - } - return XCDR (val); -} - - /* Subroutine for Fsafe_coding_systems_region_internal. Return a list of coding systems that safely encode the multibyte - text between P and PEND. SAFE_CODINGS, if non-nil, is a list of + text between P and PEND. SAFE_CODINGS, if non-nil, is an alist of possible coding systems. If it is nil, it means that we have not yet found any coding systems. @@ -6390,8 +6540,9 @@ find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found) Lisp_Object safe_codings, work_table; int *single_byte_char_found; { - int c, len, idx; - Lisp_Object val; + int c, len; + Lisp_Object val, ch; + Lisp_Object prev, tail; while (p < pend) { @@ -6403,30 +6554,96 @@ find_safe_codings (p, pend, safe_codings, work_table, single_byte_char_found) if (SINGLE_BYTE_CHAR_P (c)) *single_byte_char_found = 1; if (NILP (safe_codings)) + /* Already all coding systems are excluded. But, we can't + terminate the loop here because non-ASCII single-byte char + must be found. */ continue; /* Check the safe coding systems for C. */ - val = char_table_ref_and_index (work_table, c, &idx); + ch = make_number (c); + val = Faref (work_table, ch); if (EQ (val, Qt)) /* This element was already checked. Ignore it. */ continue; /* Remember that we checked this element. */ - CHAR_TABLE_SET (work_table, make_number (idx), Qt); + Faset (work_table, ch, Qt); - /* If there are some safe coding systems for C and we have - already found the other set of coding systems for the - different characters, get the intersection of them. */ - if (!EQ (safe_codings, Qt) && !NILP (val)) - val = intersection (safe_codings, val); - safe_codings = val; + for (prev = tail = safe_codings; CONSP (tail); tail = XCDR (tail)) + { + Lisp_Object elt, translation_table, hash_table, accept_latin_extra; + int encodable; + + elt = XCAR (tail); + if (CONSP (XCDR (elt))) + { + /* This entry has this format now: + ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE + ACCEPT-LATIN-EXTRA ) */ + val = XCDR (elt); + encodable = ! NILP (Faref (XCAR (val), ch)); + if (! encodable) + { + val = XCDR (val); + translation_table = XCAR (val); + hash_table = XCAR (XCDR (val)); + accept_latin_extra = XCAR (XCDR (XCDR (val))); + } + } + else + { + /* This entry has this format now: ( CODING . SAFE-CHARS) */ + encodable = ! NILP (Faref (XCDR (elt), ch)); + if (! encodable) + { + /* Transform the format to: + ( CODING SAFE-CHARS TRANSLATION-TABLE HASH-TABLE + ACCEPT-LATIN-EXTRA ) */ + val = Fget (XCAR (elt), Qcoding_system); + translation_table + = Fplist_get (AREF (val, 3), + Qtranslation_table_for_encode); + if (SYMBOLP (translation_table)) + translation_table = Fget (translation_table, + Qtranslation_table); + hash_table + = (CHAR_TABLE_P (translation_table) + ? XCHAR_TABLE (translation_table)->extras[1] + : Qnil); + accept_latin_extra + = ((EQ (AREF (val, 0), make_number (2)) + && VECTORP (AREF (val, 4))) + ? AREF (AREF (val, 4), 16) + : Qnil); + XSETCAR (tail, list5 (XCAR (elt), XCDR (elt), + translation_table, hash_table, + accept_latin_extra)); + } + } + + if (! encodable + && ((CHAR_TABLE_P (translation_table) + && ! NILP (Faref (translation_table, ch))) + || (HASH_TABLE_P (hash_table) + && ! NILP (Fgethash (ch, hash_table, Qnil))) + || (SINGLE_BYTE_CHAR_P (c) + && ! NILP (accept_latin_extra) + && VECTORP (Vlatin_extra_code_table) + && ! NILP (AREF (Vlatin_extra_code_table, c))))) + encodable = 1; + if (encodable) + prev = tail; + else + { + /* Exclude this coding system from SAFE_CODINGS. */ + if (EQ (tail, safe_codings)) + safe_codings = XCDR (safe_codings); + else + XSETCDR (prev, XCDR (tail)); + } + } } return safe_codings; } - -/* Return a list of coding systems that safely encode the text between - START and END. If the text contains only ASCII or is unibyte, - return t. */ - DEFUN ("find-coding-systems-region-internal", Ffind_coding_systems_region_internal, Sfind_coding_systems_region_internal, 2, 2, 0, @@ -6485,28 +6702,35 @@ DEFUN ("find-coding-systems-region-internal", } /* The text contains non-ASCII characters. */ - work_table = Fcopy_sequence (Vchar_coding_system_table); - safe_codings = find_safe_codings (p1, p1end, Qt, work_table, + + work_table = Fmake_char_table (Qchar_coding_system, Qnil); + safe_codings = Fcopy_sequence (XCDR (Vcoding_system_safe_chars)); + + safe_codings = find_safe_codings (p1, p1end, safe_codings, work_table, &single_byte_char_found); if (p2 < p2end) safe_codings = find_safe_codings (p2, p2end, safe_codings, work_table, &single_byte_char_found); - - if (EQ (safe_codings, Qt)) - ; /* Nothing to be done. */ - else if (!single_byte_char_found) + if (EQ (safe_codings, XCDR (Vcoding_system_safe_chars))) + safe_codings = Qt; + else { - /* Append generic coding systems. */ - Lisp_Object args[2]; - args[0] = safe_codings; - args[1] = Fchar_table_extra_slot (Vchar_coding_system_table, - make_number (0)); - safe_codings = Fappend (2, args); + /* Turn safe_codings to a list of coding systems... */ + Lisp_Object val; + + if (single_byte_char_found) + /* ... and append these for eight-bit chars. */ + val = Fcons (Qraw_text, + Fcons (Qemacs_mule, Fcons (Qno_conversion, Qnil))); + else + /* ... and append generic coding systems. */ + val = Fcopy_sequence (XCAR (Vcoding_system_safe_chars)); + + for (; CONSP (safe_codings); safe_codings = XCDR (safe_codings)) + val = Fcons (XCAR (XCAR (safe_codings)), val); + safe_codings = val; } - else - safe_codings = Fcons (Qraw_text, - Fcons (Qemacs_mule, - Fcons (Qno_conversion, safe_codings))); + return safe_codings; } @@ -6587,7 +6811,10 @@ to the string. */) if (NILP (current_buffer->enable_multibyte_characters)) return Qnil; p = CHAR_POS_ADDR (from); - pend = CHAR_POS_ADDR (to); + if (to == GPT) + pend = GPT_ADDR; + else + pend = CHAR_POS_ADDR (to); } else { @@ -6622,7 +6849,7 @@ to the string. */) if (coding.type == coding_type_undecided) safe_chars = Qnil; else - safe_chars = coding_safe_chars (&coding); + safe_chars = coding_safe_chars (coding_system); if (STRINGP (string) || from >= GPT || to <= GPT) @@ -6909,8 +7136,7 @@ Return the corresponding character code in Big5. */) return val; } -DEFUN ("set-terminal-coding-system-internal", - Fset_terminal_coding_system_internal, +DEFUN ("set-terminal-coding-system-internal", Fset_terminal_coding_system_internal, Sset_terminal_coding_system_internal, 1, 1, 0, doc: /* Internal use only. */) (coding_system) @@ -6919,7 +7145,7 @@ DEFUN ("set-terminal-coding-system-internal", CHECK_SYMBOL (coding_system); setup_coding_system (Fcheck_coding_system (coding_system), &terminal_coding); /* We had better not send unsafe characters to terminal. */ - terminal_coding.flags |= CODING_FLAG_ISO_SAFE; + terminal_coding.mode |= CODING_MODE_INHIBIT_UNENCODABLE_CHAR; /* Character composition should be disabled. */ terminal_coding.composing = COMPOSITION_DISABLED; /* Error notification should be suppressed. */ @@ -6929,8 +7155,7 @@ DEFUN ("set-terminal-coding-system-internal", return Qnil; } -DEFUN ("set-safe-terminal-coding-system-internal", - Fset_safe_terminal_coding_system_internal, +DEFUN ("set-safe-terminal-coding-system-internal", Fset_safe_terminal_coding_system_internal, Sset_safe_terminal_coding_system_internal, 1, 1, 0, doc: /* Internal use only. */) (coding_system) @@ -6948,16 +7173,15 @@ DEFUN ("set-safe-terminal-coding-system-internal", return Qnil; } -DEFUN ("terminal-coding-system", - Fterminal_coding_system, Sterminal_coding_system, 0, 0, 0, +DEFUN ("terminal-coding-system", Fterminal_coding_system, + Sterminal_coding_system, 0, 0, 0, doc: /* Return coding system specified for terminal output. */) () { return terminal_coding.symbol; } -DEFUN ("set-keyboard-coding-system-internal", - Fset_keyboard_coding_system_internal, +DEFUN ("set-keyboard-coding-system-internal", Fset_keyboard_coding_system_internal, Sset_keyboard_coding_system_internal, 1, 1, 0, doc: /* Internal use only. */) (coding_system) @@ -6970,8 +7194,8 @@ DEFUN ("set-keyboard-coding-system-internal", return Qnil; } -DEFUN ("keyboard-coding-system", - Fkeyboard_coding_system, Skeyboard_coding_system, 0, 0, 0, +DEFUN ("keyboard-coding-system", Fkeyboard_coding_system, + Skeyboard_coding_system, 0, 0, 0, doc: /* Return coding system specified for decoding keyboard input. */) () { @@ -7143,6 +7367,40 @@ This function is internal use only. */) return Qnil; } +DEFUN ("define-coding-system-internal", Fdefine_coding_system_internal, + Sdefine_coding_system_internal, 1, 1, 0, + doc: /* Register CODING-SYSTEM as a base coding system. +This function is internal use only. */) + (coding_system) + Lisp_Object coding_system; +{ + Lisp_Object safe_chars, slot; + + if (NILP (Fcheck_coding_system (coding_system))) + Fsignal (Qcoding_system_error, Fcons (coding_system, Qnil)); + safe_chars = coding_safe_chars (coding_system); + if (! EQ (safe_chars, Qt) && ! CHAR_TABLE_P (safe_chars)) + error ("No valid safe-chars property for %s", + SDATA (SYMBOL_NAME (coding_system))); + if (EQ (safe_chars, Qt)) + { + if (NILP (Fmemq (coding_system, XCAR (Vcoding_system_safe_chars)))) + XSETCAR (Vcoding_system_safe_chars, + Fcons (coding_system, XCAR (Vcoding_system_safe_chars))); + } + else + { + slot = Fassq (coding_system, XCDR (Vcoding_system_safe_chars)); + if (NILP (slot)) + XSETCDR (Vcoding_system_safe_chars, + nconc2 (XCDR (Vcoding_system_safe_chars), + Fcons (Fcons (coding_system, safe_chars), Qnil))); + else + XSETCDR (slot, safe_chars); + } + return Qnil; +} + #endif /* emacs */ @@ -7296,9 +7554,12 @@ syms_of_coding () } } + Vcoding_system_safe_chars = Fcons (Qnil, Qnil); + staticpro (&Vcoding_system_safe_chars); + Qtranslation_table = intern ("translation-table"); staticpro (&Qtranslation_table); - Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (1)); + Fput (Qtranslation_table, Qchar_table_extra_slots, make_number (2)); Qtranslation_table_id = intern ("translation-table-id"); staticpro (&Qtranslation_table_id); @@ -7320,7 +7581,7 @@ syms_of_coding () But don't staticpro it here--that is done in alloc.c. */ Qchar_table_extra_slots = intern ("char-table-extra-slots"); Fput (Qsafe_chars, Qchar_table_extra_slots, make_number (0)); - Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (2)); + Fput (Qchar_coding_system, Qchar_table_extra_slots, make_number (0)); Qvalid_codes = intern ("valid-codes"); staticpro (&Qvalid_codes); @@ -7331,6 +7592,9 @@ syms_of_coding () Qraw_text = intern ("raw-text"); staticpro (&Qraw_text); + Qutf_8 = intern ("utf-8"); + staticpro (&Qutf_8); + defsubr (&Scoding_system_p); defsubr (&Sread_coding_system); defsubr (&Sread_non_nil_coding_system); @@ -7355,6 +7619,7 @@ syms_of_coding () defsubr (&Sfind_operation_coding_system); defsubr (&Supdate_coding_systems_internal); defsubr (&Sset_coding_priority_internal); + defsubr (&Sdefine_coding_system_internal); DEFVAR_LISP ("coding-system-list", &Vcoding_system_list, doc: /* List of coding systems. @@ -7415,7 +7680,9 @@ the value of `buffer-file-coding-system' is used. */); Vcoding_system_for_write = Qnil; DEFVAR_LISP ("last-coding-system-used", &Vlast_coding_system_used, - doc: /* Coding system used in the latest file or process I/O. */); + doc: /* Coding system used in the latest file or process I/O. +Also set by `encode-coding-region', `decode-coding-region', +`encode-coding-string' and `decode-coding-string'. */); Vlast_coding_system_used = Qnil; DEFVAR_BOOL ("inhibit-eol-conversion", &inhibit_eol_conversion, @@ -7549,11 +7816,14 @@ coding system used in each operation can't encode the text. The default value is `select-safe-coding-system' (which see). */); Vselect_safe_coding_system_function = Qnil; - DEFVAR_LISP ("char-coding-system-table", &Vchar_coding_system_table, - doc: /* Char-table containing safe coding systems of each characters. -Each element doesn't include such generic coding systems that can -encode any characters. They are in the first extra slot. */); - Vchar_coding_system_table = Fmake_char_table (Qchar_coding_system, Qnil); + DEFVAR_BOOL ("coding-system-require-warning", + &coding_system_require_warning, + doc: /* Internal use only. +If non-nil, on writing a file, `select-safe-coding-system-function' is +called even if `coding-system-for-write' is non-nil. The command +`universal-coding-system-argument' binds this variable to t temporarily. */); + coding_system_require_warning = 0; + DEFVAR_BOOL ("inhibit-iso-escape-detection", &inhibit_iso_escape_detection, @@ -7581,6 +7851,12 @@ The other way to read escape sequences in a file without decoding is to explicitly specify some coding system that doesn't use ISO2022's escape sequence (e.g `latin-1') on reading by \\[universal-coding-system-argument]. */); inhibit_iso_escape_detection = 0; + + DEFVAR_LISP ("translation-table-for-input", &Vtranslation_table_for_input, + doc: /* Char table for translating self-inserting characters. +This is applied to the result of input methods, not their input. See also +`keyboard-translate-table'. */); + Vtranslation_table_for_input = Qnil; } char * @@ -7605,3 +7881,5 @@ emacs_strerror (error_number) #endif /* emacs */ +/* arch-tag: 3a3a2b01-5ff6-4071-9afe-f5b808d9229d + (do not change this comment) */