X-Git-Url: http://git.hcoop.net/bpt/emacs.git/blobdiff_plain/a3cbb6314b5718226109c741f5c916684de6b935..971de7fb158335fbda39525feb2d7776a26bc030:/src/character.c diff --git a/src/character.c b/src/character.c index 062adeb57b..a6c38df9e8 100644 --- a/src/character.c +++ b/src/character.c @@ -1,9 +1,9 @@ /* Basic character support. Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN. Licensed to the Free Software Foundation. - Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008 + Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 Free Software Foundation, Inc. - Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008 + Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010 National Institute of Advanced Industrial Science and Technology (AIST) Registration Number H13PRO009 @@ -34,6 +34,7 @@ along with GNU Emacs. If not, see . */ #ifdef emacs #include +#include #include "lisp.h" #include "character.h" #include "buffer.h" @@ -59,7 +60,7 @@ Lisp_Object Vauto_fill_chars; Lisp_Object Qauto_fill_chars; /* Char-table of information about which character to unify to which - Unicode character. */ + Unicode character. Mainly used by the macro MAYBE_UNIFY_CHAR. */ Lisp_Object Vchar_unify_table; /* A char-table. An element is non-nil iff the corresponding @@ -86,22 +87,13 @@ Lisp_Object Vscript_representative_chars; static Lisp_Object Qchar_script_table; Lisp_Object Vunicode_category_table; - -/* Mapping table from unibyte chars to multibyte chars. */ -int unibyte_to_multibyte_table[256]; - -/* Nth element is 1 iff unibyte char N can be mapped to a multibyte - char. */ -char unibyte_has_multibyte_table[256]; - /* If character code C has modifier masks, reflect them to the character code if possible. Return the resulting code. */ int -char_resolve_modifier_mask (c) - int c; +char_resolve_modifier_mask (int c) { /* A non-ASCII character can't reflect modifier bits to the code. */ if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK))) @@ -134,11 +126,13 @@ char_resolve_modifier_mask (c) else if ((c & 0177) >= 0100 && (c & 0177) <= 0137) c &= (037 | (~0177 & ~CHAR_CTL)); } +#if 0 /* This is outside the scope of this function. (bug#4751) */ if (c & CHAR_META) { /* Move the meta bit to the right place for a string. */ c = (c & ~CHAR_META) | 0x80; } +#endif return c; } @@ -148,9 +142,7 @@ char_resolve_modifier_mask (c) handle them appropriately. */ int -char_string (c, p) - unsigned c; - unsigned char *p; +char_string (unsigned int c, unsigned char *p) { int bytes; @@ -204,10 +196,7 @@ char_string (c, p) character) of the multibyte form. */ int -string_char (p, advanced, len) - const unsigned char *p; - const unsigned char **advanced; - int *len; +string_char (const unsigned char *p, const unsigned char **advanced, int *len) { int c; const unsigned char *saved_p = p; @@ -250,9 +239,7 @@ string_char (p, advanced, len) case, translace C by all tables. */ int -translate_char (table, c) - Lisp_Object table; - int c; +translate_char (Lisp_Object table, int c) { if (CHAR_TABLE_P (table)) { @@ -270,43 +257,33 @@ translate_char (table, c) return c; } -/* Convert the multibyte character C to unibyte 8-bit character based - on the current value of charset_unibyte. If dimension of - charset_unibyte is more than one, return (C & 0xFF). +/* Convert ASCII or 8-bit character C to unibyte. If C is none of + them, return (C & 0xFF). The argument REV_TBL is now ignored. It will be removed in the future. */ int -multibyte_char_to_unibyte (c, rev_tbl) - int c; - Lisp_Object rev_tbl; +multibyte_char_to_unibyte (int c, Lisp_Object rev_tbl) { - struct charset *charset; - unsigned c1; - + if (c < 0x80) + return c; if (CHAR_BYTE8_P (c)) return CHAR_TO_BYTE8 (c); - charset = CHARSET_FROM_ID (charset_unibyte); - c1 = ENCODE_CHAR (charset, c); - return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF); + return (c & 0xFF); } /* Like multibyte_char_to_unibyte, but return -1 if C is not supported by charset_unibyte. */ int -multibyte_char_to_unibyte_safe (c) - int c; +multibyte_char_to_unibyte_safe (int c) { - struct charset *charset; - unsigned c1; - + if (c < 0x80) + return c; if (CHAR_BYTE8_P (c)) return CHAR_TO_BYTE8 (c); - charset = CHARSET_FROM_ID (charset_unibyte); - c1 = ENCODE_CHAR (charset, c); - return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : -1); + return -1; } DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0, @@ -331,16 +308,12 @@ DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte, Lisp_Object ch; { int c; - struct charset *charset; CHECK_CHARACTER (ch); c = XFASTINT (ch); - if (c >= 0400) - error ("Invalid unibyte character: %d", c); - charset = CHARSET_FROM_ID (charset_unibyte); - c = DECODE_CHAR (charset, c); - if (c < 0) - c = BYTE8_TO_CHAR (XFASTINT (ch)); + if (c >= 0x100) + error ("Not a unibyte character: %d", c); + MAKE_CHAR_MULTIBYTE (c); return make_number (c); } @@ -411,9 +384,7 @@ usage: (char-width CHAR) */) respectively. */ int -c_string_width (str, len, precision, nchars, nbytes) - const unsigned char *str; - int precision, *nchars, *nbytes; +c_string_width (const unsigned char *str, int len, int precision, int *nchars, int *nbytes) { int i = 0, i_byte = 0; int width = 0; @@ -423,7 +394,7 @@ c_string_width (str, len, precision, nchars, nbytes) { int bytes, thiswidth; Lisp_Object val; - int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes); + int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes); if (dp) { @@ -464,9 +435,7 @@ c_string_width (str, len, precision, nchars, nbytes) occupies on the screen. */ int -strwidth (str, len) - unsigned char *str; - int len; +strwidth (unsigned char *str, int len) { return c_string_width (str, len, -1, NULL, NULL); } @@ -479,9 +448,7 @@ strwidth (str, len) in *NCHARS and *NBYTES respectively. */ int -lisp_string_width (string, precision, nchars, nbytes) - Lisp_Object string; - int precision, *nchars, *nbytes; +lisp_string_width (Lisp_Object string, int precision, int *nchars, int *nbytes) { int len = SCHARS (string); /* This set multibyte to 0 even if STRING is multibyte when it @@ -513,7 +480,7 @@ lisp_string_width (string, precision, nchars, nbytes) int c; if (multibyte) - c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes); + c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes); else c = str[i_byte], bytes = 1; chars = 1; @@ -591,9 +558,7 @@ usage: (char-direction CHAR) */) nil, we treat each byte as a character. */ EMACS_INT -chars_in_text (ptr, nbytes) - const unsigned char *ptr; - EMACS_INT nbytes; +chars_in_text (const unsigned char *ptr, EMACS_INT nbytes) { /* current_buffer is null at early stages of Emacs initialization. */ if (current_buffer == 0 @@ -609,9 +574,7 @@ chars_in_text (ptr, nbytes) ignores enable-multibyte-characters. */ EMACS_INT -multibyte_chars_in_text (ptr, nbytes) - const unsigned char *ptr; - EMACS_INT nbytes; +multibyte_chars_in_text (const unsigned char *ptr, EMACS_INT nbytes) { const unsigned char *endp = ptr + nbytes; int chars = 0; @@ -636,9 +599,7 @@ multibyte_chars_in_text (ptr, nbytes) represented by 2-byte in a multibyte text. */ void -parse_str_as_multibyte (str, len, nchars, nbytes) - const unsigned char *str; - int len, *nchars, *nbytes; +parse_str_as_multibyte (const unsigned char *str, int len, int *nchars, int *nbytes) { const unsigned char *endp = str + len; int n, chars = 0, bytes = 0; @@ -648,7 +609,8 @@ parse_str_as_multibyte (str, len, nchars, nbytes) const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH; while (str < adjusted_endp) { - if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0) + if (! CHAR_BYTE8_HEAD_P (*str) + && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0) str += n, bytes += n; else str++, bytes += 2; @@ -657,7 +619,8 @@ parse_str_as_multibyte (str, len, nchars, nbytes) } while (str < endp) { - if ((n = MULTIBYTE_LENGTH (str, endp)) > 0) + if (! CHAR_BYTE8_HEAD_P (*str) + && (n = MULTIBYTE_LENGTH (str, endp)) > 0) str += n, bytes += n; else str++, bytes += 2; @@ -678,9 +641,7 @@ parse_str_as_multibyte (str, len, nchars, nbytes) resulting text. */ int -str_as_multibyte (str, len, nbytes, nchars) - unsigned char *str; - int len, nbytes, *nchars; +str_as_multibyte (unsigned char *str, int len, int nbytes, int *nchars) { unsigned char *p = str, *endp = str + nbytes; unsigned char *to; @@ -691,10 +652,13 @@ str_as_multibyte (str, len, nbytes, nchars) { unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH; while (p < adjusted_endp + && ! CHAR_BYTE8_HEAD_P (*p) && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0) p += n, chars++; } - while ((n = MULTIBYTE_LENGTH (p, endp)) > 0) + while (p < endp + && ! CHAR_BYTE8_HEAD_P (*p) + && (n = MULTIBYTE_LENGTH (p, endp)) > 0) p += n, chars++; if (nchars) *nchars = chars; @@ -712,7 +676,8 @@ str_as_multibyte (str, len, nbytes, nchars) unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH; while (p < adjusted_endp) { - if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0) + if (! CHAR_BYTE8_HEAD_P (*p) + && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0) { while (n--) *to++ = *p++; @@ -728,7 +693,8 @@ str_as_multibyte (str, len, nbytes, nchars) } while (p < endp) { - if ((n = MULTIBYTE_LENGTH (p, endp)) > 0) + if (! CHAR_BYTE8_HEAD_P (*p) + && (n = MULTIBYTE_LENGTH (p, endp)) > 0) { while (n--) *to++ = *p++; @@ -751,9 +717,7 @@ str_as_multibyte (str, len, nbytes, nchars) `str_to_multibyte'. */ int -parse_str_to_multibyte (str, len) - unsigned char *str; - int len; +parse_str_to_multibyte (unsigned char *str, int len) { unsigned char *endp = str + len; int bytes; @@ -771,9 +735,7 @@ parse_str_to_multibyte (str, len) enough. */ int -str_to_multibyte (str, len, bytes) - unsigned char *str; - int len, bytes; +str_to_multibyte (unsigned char *str, int len, int bytes) { unsigned char *p = str, *endp = str + bytes; unsigned char *to; @@ -802,9 +764,7 @@ str_to_multibyte (str, len, bytes) unibyte. */ int -str_as_unibyte (str, bytes) - unsigned char *str; - int bytes; +str_as_unibyte (unsigned char *str, int bytes) { const unsigned char *p = str, *endp = str + bytes; unsigned char *to; @@ -846,11 +806,7 @@ str_as_unibyte (str, bytes) Note: Currently the arg ACCEPT_LATIN_1 is not used. */ EMACS_INT -str_to_unibyte (src, dst, chars, accept_latin_1) - const unsigned char *src; - unsigned char *dst; - EMACS_INT chars; - int accept_latin_1; +str_to_unibyte (const unsigned char *src, unsigned char *dst, EMACS_INT chars, int accept_latin_1) { EMACS_INT i; @@ -870,8 +826,7 @@ str_to_unibyte (src, dst, chars, accept_latin_1) int -string_count_byte8 (string) - Lisp_Object string; +string_count_byte8 (Lisp_Object string) { int multibyte = STRING_MULTIBYTE (string); int nbytes = SBYTES (string); @@ -901,8 +856,7 @@ string_count_byte8 (string) Lisp_Object -string_escape_byte8 (string) - Lisp_Object string; +string_escape_byte8 (Lisp_Object string) { int nchars = SCHARS (string); int nbytes = SBYTES (string); @@ -972,10 +926,13 @@ usage: (string &rest CHARACTERS) */) int n; Lisp_Object *args; { - int i; - unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n); - unsigned char *p = buf; - int c; + int i, c; + unsigned char *buf, *p; + Lisp_Object str; + USE_SAFE_ALLOCA; + + SAFE_ALLOCA (buf, unsigned char *, MAX_MULTIBYTE_LENGTH * n); + p = buf; for (i = 0; i < n; i++) { @@ -984,7 +941,9 @@ usage: (string &rest CHARACTERS) */) p += CHAR_STRING (c, p); } - return make_string_from_bytes ((char *) buf, n, p - buf); + str = make_string_from_bytes ((char *) buf, n, p - buf); + SAFE_FREE (); + return str; } DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0, @@ -994,10 +953,13 @@ usage: (unibyte-string &rest BYTES) */) int n; Lisp_Object *args; { - int i; - unsigned char *buf = (unsigned char *) alloca (n); - unsigned char *p = buf; - unsigned c; + int i, c; + unsigned char *buf, *p; + Lisp_Object str; + USE_SAFE_ALLOCA; + + SAFE_ALLOCA (buf, unsigned char *, n); + p = buf; for (i = 0; i < n; i++) { @@ -1008,15 +970,17 @@ usage: (unibyte-string &rest BYTES) */) *p++ = c; } - return make_string_from_bytes ((char *) buf, n, p - buf); + str = make_string_from_bytes ((char *) buf, n, p - buf); + SAFE_FREE (); + return str; } -DEFUN ("char-resolve-modifers", Fchar_resolve_modifiers, +DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers, Schar_resolve_modifiers, 1, 1, 0, doc: /* Resolve modifiers in the character CHAR. The value is a character with modifiers resolved into the character code. Unresolved modifiers are kept in the value. -usage: (char-resolve-modifers CHAR) */) +usage: (char-resolve-modifiers CHAR) */) (character) Lisp_Object character; { @@ -1027,15 +991,76 @@ usage: (char-resolve-modifers CHAR) */) return make_number (char_resolve_modifier_mask (c)); } +DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0, + doc: /* Return a byte value of a character at point. +Optional 1st arg POSITION, if non-nil, is a position of a character to get +a byte value. +Optional 2nd arg STRING, if non-nil, is a string of which first +character is a target to get a byte value. In this case, POSITION, if +non-nil, is an index of a target character in the string. + +If the current buffer (or STRING) is multibyte, and the target +character is not ASCII nor 8-bit character, an error is signalled. */) + (position, string) + Lisp_Object position, string; +{ + int c; + EMACS_INT pos; + unsigned char *p; + + if (NILP (string)) + { + if (NILP (position)) + { + p = PT_ADDR; + } + else + { + CHECK_NUMBER_COERCE_MARKER (position); + if (XINT (position) < BEGV || XINT (position) >= ZV) + args_out_of_range_3 (position, make_number (BEGV), make_number (ZV)); + pos = XFASTINT (position); + p = CHAR_POS_ADDR (pos); + } + if (NILP (current_buffer->enable_multibyte_characters)) + return make_number (*p); + } + else + { + CHECK_STRING (string); + if (NILP (position)) + { + p = SDATA (string); + } + else + { + CHECK_NATNUM (position); + if (XINT (position) >= SCHARS (string)) + args_out_of_range (string, position); + pos = XFASTINT (position); + p = SDATA (string) + string_char_to_byte (string, pos); + } + if (! STRING_MULTIBYTE (string)) + return make_number (*p); + } + c = STRING_CHAR (p); + if (CHAR_BYTE8_P (c)) + c = CHAR_TO_BYTE8 (c); + else if (! ASCII_CHAR_P (c)) + error ("Not an ASCII nor an 8-bit character: %d", c); + return make_number (c); +} + + void -init_character_once () +init_character_once (void) { } #ifdef emacs void -syms_of_character () +syms_of_character (void) { DEFSYM (Qcharacterp, "characterp"); DEFSYM (Qauto_fill_chars, "auto-fill-chars"); @@ -1054,6 +1079,7 @@ syms_of_character () defsubr (&Sstring); defsubr (&Sunibyte_string); defsubr (&Schar_resolve_modifiers); + defsubr (&Sget_byte); DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector, doc: /* @@ -1098,19 +1124,25 @@ It has one extra slot whose value is a list of script symbols. */); /* Intern this now in case it isn't already done. Setting this variable twice is harmless. But don't staticpro it here--that is done in alloc.c. */ - Qchar_table_extra_slots = intern ("char-table-extra-slots"); + Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots"); DEFSYM (Qchar_script_table, "char-script-table"); Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1)); Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil); DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars, - doc: /* Alist of scripts vs the representative characters. */); + doc: /* Alist of scripts vs the representative characters. +Each element is a cons (SCRIPT . CHARS). +SCRIPT is a symbol representing a script or a subgroup of a script. +CHARS is a list or a vector of characters. +If it is a list, all characters in the list are necessary for supporting SCRIPT. +If it is a vector, one of the characters in the vector is necessary. +This variable is used to find a font for a specific script. */); Vscript_representative_chars = Qnil; DEFVAR_LISP ("unicode-category-table", &Vunicode_category_table, doc: /* Char table of Unicode's "General Category". -All Unicode characters has one of the following values (symbol): - Lw, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po, +All Unicode characters have one of the following values (symbol): + Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po, Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn See The Unicode Standard for the meaning of those values. */); /* The correct char-table is setup in characters.el. */