/* Basic character support.
Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
Licensed to the Free Software Foundation.
- Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
+ Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
Free Software Foundation, Inc.
- Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008
+ Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
National Institute of Advanced Industrial Science and Technology (AIST)
Registration Number H13PRO009
#ifdef emacs
#include <sys/types.h>
+#include <setjmp.h>
#include "lisp.h"
#include "character.h"
#include "buffer.h"
Lisp_Object Qauto_fill_chars;
/* Char-table of information about which character to unify to which
- Unicode character. */
+ Unicode character. Mainly used by the macro MAYBE_UNIFY_CHAR. */
Lisp_Object Vchar_unify_table;
/* A char-table. An element is non-nil iff the corresponding
static Lisp_Object Qchar_script_table;
-/* Mapping table from unibyte chars to multibyte chars. */
-int unibyte_to_multibyte_table[256];
-
-/* Nth element is 1 iff unibyte char N can be mapped to a multibyte
- char. */
-char unibyte_has_multibyte_table[256];
-
+Lisp_Object Vunicode_category_table;
\f
/* If character code C has modifier masks, reflect them to the
character code if possible. Return the resulting code. */
int
-char_resolve_modifier_mask (c)
- int c;
+char_resolve_modifier_mask (int c)
{
/* A non-ASCII character can't reflect modifier bits to the code. */
if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
c &= (037 | (~0177 & ~CHAR_CTL));
}
+#if 0 /* This is outside the scope of this function. (bug#4751) */
if (c & CHAR_META)
{
/* Move the meta bit to the right place for a string. */
c = (c & ~CHAR_META) | 0x80;
}
+#endif
return c;
}
handle them appropriately. */
int
-char_string (c, p)
- unsigned c;
- unsigned char *p;
+char_string (unsigned int c, unsigned char *p)
{
int bytes;
character) of the multibyte form. */
int
-string_char (p, advanced, len)
- const unsigned char *p;
- const unsigned char **advanced;
- int *len;
+string_char (const unsigned char *p, const unsigned char **advanced, int *len)
{
int c;
const unsigned char *saved_p = p;
case, translate C by all tables. */
int
-translate_char (table, c)
- Lisp_Object table;
- int c;
+translate_char (Lisp_Object table, int c)
{
if (CHAR_TABLE_P (table))
{
return c;
}
-/* Convert the multibyte character C to unibyte 8-bit character based
- on the current value of charset_unibyte. If dimension of
- charset_unibyte is more than one, return (C & 0xFF).
+/* Convert ASCII or 8-bit character C to unibyte. If C is none of
+ them, return (C & 0xFF).
The argument REV_TBL is now ignored. It will be removed in the
future. */
int
-multibyte_char_to_unibyte (c, rev_tbl)
- int c;
- Lisp_Object rev_tbl;
+multibyte_char_to_unibyte (int c, Lisp_Object rev_tbl) /* Map character C to a single byte value; REV_TBL is not referenced in the body. */
{
- struct charset *charset;
- unsigned c1;
-
+ if (c < 0x80) /* Codes below 0x80 are returned unchanged. */
+ return c;
if (CHAR_BYTE8_P (c)) /* C satisfies CHAR_BYTE8_P: convert it with CHAR_TO_BYTE8. */
return CHAR_TO_BYTE8 (c);
- charset = CHARSET_FROM_ID (charset_unibyte);
- c1 = ENCODE_CHAR (charset, c);
- return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
+ return (c & 0xFF); /* Any other character: keep only the low 8 bits. */
}
/* Like multibyte_char_to_unibyte, but return -1 if C is neither an
ASCII nor an 8-bit character. */
int
-multibyte_char_to_unibyte_safe (c)
- int c;
+multibyte_char_to_unibyte_safe (int c) /* Map character C to a single byte value, or -1 when no case below applies. */
{
- struct charset *charset;
- unsigned c1;
-
+ if (c < 0x80) /* Codes below 0x80 are returned unchanged. */
+ return c;
if (CHAR_BYTE8_P (c)) /* C satisfies CHAR_BYTE8_P: convert it with CHAR_TO_BYTE8. */
return CHAR_TO_BYTE8 (c);
- charset = CHARSET_FROM_ID (charset_unibyte);
- c1 = ENCODE_CHAR (charset, c);
- return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : -1);
+ return -1; /* Neither case matched: report failure to the caller. */
}
DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
Lisp_Object ch;
{
int c;
- struct charset *charset;
CHECK_CHARACTER (ch);
c = XFASTINT (ch);
- if (c >= 0400)
- error ("Invalid unibyte character: %d", c);
- charset = CHARSET_FROM_ID (charset_unibyte);
- c = DECODE_CHAR (charset, c);
- if (c < 0)
- c = BYTE8_TO_CHAR (XFASTINT (ch));
+ if (c >= 0x100)
+ error ("Not a unibyte character: %d", c);
+ MAKE_CHAR_MULTIBYTE (c);
return make_number (c);
}
respectively. */
int
-c_string_width (str, len, precision, nchars, nbytes)
- const unsigned char *str;
- int precision, *nchars, *nbytes;
+c_string_width (const unsigned char *str, int len, int precision, int *nchars, int *nbytes)
{
int i = 0, i_byte = 0;
int width = 0;
{
int bytes, thiswidth;
Lisp_Object val;
- int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
+ int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
if (dp)
{
occupies on the screen. */
int
-strwidth (str, len)
- unsigned char *str;
- int len;
+strwidth (unsigned char *str, int len) /* Thin wrapper: returns whatever c_string_width returns for STR and LEN. */
{
return c_string_width (str, len, -1, NULL, NULL); /* PRECISION is -1 and both count pointers are NULL; presumably "no width limit, counts not wanted" -- confirm against c_string_width. */
}
in *NCHARS and *NBYTES respectively. */
int
-lisp_string_width (string, precision, nchars, nbytes)
- Lisp_Object string;
- int precision, *nchars, *nbytes;
+lisp_string_width (Lisp_Object string, int precision, int *nchars, int *nbytes)
{
int len = SCHARS (string);
/* This sets multibyte to 0 even if STRING is multibyte when it
int c;
if (multibyte)
- c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
+ c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
else
c = str[i_byte], bytes = 1;
chars = 1;
nil, we treat each byte as a character. */
EMACS_INT
-chars_in_text (ptr, nbytes)
- const unsigned char *ptr;
- EMACS_INT nbytes;
+chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
{
/* current_buffer is null at early stages of Emacs initialization. */
if (current_buffer == 0
ignores enable-multibyte-characters. */
EMACS_INT
-multibyte_chars_in_text (ptr, nbytes)
- const unsigned char *ptr;
- EMACS_INT nbytes;
+multibyte_chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
{
const unsigned char *endp = ptr + nbytes;
int chars = 0;
represented by 2-byte in a multibyte text. */
void
-parse_str_as_multibyte (str, len, nchars, nbytes)
- const unsigned char *str;
- int len, *nchars, *nbytes;
+parse_str_as_multibyte (const unsigned char *str, int len, int *nchars, int *nbytes)
{
const unsigned char *endp = str + len;
int n, chars = 0, bytes = 0;
const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
while (str < adjusted_endp)
{
- if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
+ if (! CHAR_BYTE8_HEAD_P (*str)
+ && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
str += n, bytes += n;
else
str++, bytes += 2;
}
while (str < endp)
{
- if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
+ if (! CHAR_BYTE8_HEAD_P (*str)
+ && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
str += n, bytes += n;
else
str++, bytes += 2;
resulting text. */
int
-str_as_multibyte (str, len, nbytes, nchars)
- unsigned char *str;
- int len, nbytes, *nchars;
+str_as_multibyte (unsigned char *str, int len, int nbytes, int *nchars)
{
unsigned char *p = str, *endp = str + nbytes;
unsigned char *to;
{
unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
while (p < adjusted_endp
+ && ! CHAR_BYTE8_HEAD_P (*p)
&& (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
p += n, chars++;
}
- while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
+ while (p < endp
+ && ! CHAR_BYTE8_HEAD_P (*p)
+ && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
p += n, chars++;
if (nchars)
*nchars = chars;
unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
while (p < adjusted_endp)
{
- if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
+ if (! CHAR_BYTE8_HEAD_P (*p)
+ && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
{
while (n--)
*to++ = *p++;
}
while (p < endp)
{
- if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
+ if (! CHAR_BYTE8_HEAD_P (*p)
+ && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
{
while (n--)
*to++ = *p++;
`str_to_multibyte'. */
int
-parse_str_to_multibyte (str, len)
- unsigned char *str;
- int len;
+parse_str_to_multibyte (unsigned char *str, int len)
{
unsigned char *endp = str + len;
int bytes;
enough. */
int
-str_to_multibyte (str, len, bytes)
- unsigned char *str;
- int len, bytes;
+str_to_multibyte (unsigned char *str, int len, int bytes)
{
unsigned char *p = str, *endp = str + bytes;
unsigned char *to;
unibyte. */
int
-str_as_unibyte (str, bytes)
- unsigned char *str;
- int bytes;
+str_as_unibyte (unsigned char *str, int bytes)
{
const unsigned char *p = str, *endp = str + bytes;
unsigned char *to;
Note: Currently the arg ACCEPT_LATIN_1 is not used. */
EMACS_INT
-str_to_unibyte (src, dst, chars, accept_latin_1)
- const unsigned char *src;
- unsigned char *dst;
- EMACS_INT chars;
- int accept_latin_1;
+str_to_unibyte (const unsigned char *src, unsigned char *dst, EMACS_INT chars, int accept_latin_1)
{
EMACS_INT i;
int
-string_count_byte8 (string)
- Lisp_Object string;
+string_count_byte8 (Lisp_Object string)
{
int multibyte = STRING_MULTIBYTE (string);
int nbytes = SBYTES (string);
Lisp_Object
-string_escape_byte8 (string)
- Lisp_Object string;
+string_escape_byte8 (Lisp_Object string)
{
int nchars = SCHARS (string);
int nbytes = SBYTES (string);
int n;
Lisp_Object *args;
{
- int i;
- unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
- unsigned char *p = buf;
- int c;
+ int i, c;
+ unsigned char *buf, *p;
+ Lisp_Object str;
+ USE_SAFE_ALLOCA;
+
+ SAFE_ALLOCA (buf, unsigned char *, MAX_MULTIBYTE_LENGTH * n);
+ p = buf;
for (i = 0; i < n; i++)
{
p += CHAR_STRING (c, p);
}
- return make_string_from_bytes ((char *) buf, n, p - buf);
+ str = make_string_from_bytes ((char *) buf, n, p - buf);
+ SAFE_FREE ();
+ return str;
}
DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
int n;
Lisp_Object *args;
{
- int i;
- unsigned char *buf = (unsigned char *) alloca (n);
- unsigned char *p = buf;
- unsigned c;
+ int i, c;
+ unsigned char *buf, *p;
+ Lisp_Object str;
+ USE_SAFE_ALLOCA;
+
+ SAFE_ALLOCA (buf, unsigned char *, n);
+ p = buf;
for (i = 0; i < n; i++)
{
*p++ = c;
}
- return make_string_from_bytes ((char *) buf, n, p - buf);
+ str = make_string_from_bytes ((char *) buf, n, p - buf);
+ SAFE_FREE ();
+ return str;
}
-DEFUN ("char-resolve-modifers", Fchar_resolve_modifiers,
+DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
Schar_resolve_modifiers, 1, 1, 0,
doc: /* Resolve modifiers in the character CHAR.
The value is a character with modifiers resolved into the character
code. Unresolved modifiers are kept in the value.
-usage: (char-resolve-modifers CHAR) */)
+usage: (char-resolve-modifiers CHAR) */)
(character)
Lisp_Object character;
{
return make_number (char_resolve_modifier_mask (c));
}
+DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
+ doc: /* Return a byte value of a character at point.
+Optional 1st arg POSITION, if non-nil, is a position of a character to get
+a byte value.
+Optional 2nd arg STRING, if non-nil, is a string of which first
+character is a target to get a byte value. In this case, POSITION, if
+non-nil, is an index of a target character in the string.
+
+If the current buffer (or STRING) is multibyte, and the target
+character is not ASCII nor 8-bit character, an error is signalled. */)
+ (position, string)
+ Lisp_Object position, string;
+{
+ int c;
+ EMACS_INT pos;
+ unsigned char *p; /* Address of the first byte of the target character. */
+
+ if (NILP (string)) /* No STRING given: the target is in the current buffer. */
+ {
+ if (NILP (position))
+ {
+ p = PT_ADDR; /* No POSITION either: use the address PT_ADDR yields (point, per the doc string). */
+ }
+ else
+ {
+ CHECK_NUMBER_COERCE_MARKER (position);
+ if (XINT (position) < BEGV || XINT (position) >= ZV) /* Accept only BEGV <= POSITION < ZV. */
+ args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
+ pos = XFASTINT (position);
+ p = CHAR_POS_ADDR (pos);
+ }
+ if (NILP (current_buffer->enable_multibyte_characters)) /* Multibyte flag is nil: the byte at P is the answer. */
+ return make_number (*p);
+ }
+ else
+ {
+ CHECK_STRING (string); /* STRING given: the target is in STRING. */
+ if (NILP (position))
+ {
+ p = SDATA (string); /* No POSITION: start at the beginning of the string data. */
+ }
+ else
+ {
+ CHECK_NATNUM (position);
+ if (XINT (position) >= SCHARS (string)) /* The index must be below SCHARS (string). */
+ args_out_of_range (string, position);
+ pos = XFASTINT (position);
+ p = SDATA (string) + string_char_to_byte (string, pos); /* Turn the character index into an offset from the data start. */
+ }
+ if (! STRING_MULTIBYTE (string)) /* Not a multibyte string: the byte at P is the answer. */
+ return make_number (*p);
+ }
+ c = STRING_CHAR (p); /* Multibyte text: fetch the character that starts at P. */
+ if (CHAR_BYTE8_P (c))
+ c = CHAR_TO_BYTE8 (c); /* C satisfies CHAR_BYTE8_P: convert it to its byte value. */
+ else if (! ASCII_CHAR_P (c)) /* Not ASCII either: no byte value to return, so signal an error. */
+ error ("Not an ASCII nor an 8-bit character: %d", c);
+ return make_number (c);
+}
+
+
void
-init_character_once ()
+init_character_once (void) /* Only the parameter list changes, from () to (void). */
{ /* No statements appear in this body. */
}
#ifdef emacs
void
-syms_of_character ()
+syms_of_character (void)
{
DEFSYM (Qcharacterp, "characterp");
DEFSYM (Qauto_fill_chars, "auto-fill-chars");
defsubr (&Sstring);
defsubr (&Sunibyte_string);
defsubr (&Schar_resolve_modifiers);
+ defsubr (&Sget_byte);
DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
doc: /*
/* Intern this now in case it isn't already done.
Setting this variable twice is harmless.
But don't staticpro it here--that is done in alloc.c. */
- Qchar_table_extra_slots = intern ("char-table-extra-slots");
+ Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
DEFSYM (Qchar_script_table, "char-script-table");
Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
- doc: /* Alist of scripts vs the representative characters. */);
+ doc: /* Alist of scripts vs the representative characters.
+Each element is a cons (SCRIPT . CHARS).
+SCRIPT is a symbol representing a script or a subgroup of a script.
+CHARS is a list or a vector of characters.
+If it is a list, all characters in the list are necessary for supporting SCRIPT.
+If it is a vector, one of the characters in the vector is necessary.
+This variable is used to find a font for a specific script. */);
Vscript_representative_chars = Qnil;
+
+ DEFVAR_LISP ("unicode-category-table", &Vunicode_category_table,
+ doc: /* Char table of Unicode's "General Category".
+All Unicode characters have one of the following values (symbol):
+ Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
+ Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
+See The Unicode Standard for the meaning of those values. */);
+ /* The correct char-table is set up in characters.el. */
+ Vunicode_category_table = Qnil;
}
#endif /* emacs */