1 /* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001 Free Software Foundation, Inc.
5 Copyright (C) 2001, 2002
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
9 This file is part of GNU Emacs.
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
26 /* At first, see the document in `character.h' to understand the code
37 #include <sys/types.h>
39 #include "character.h"
42 #include "composite.h"
51 Lisp_Object Qcharacterp
;
53 /* Vector of translation table ever defined.
54 ID of a translation table is used to index this vector. */
55 Lisp_Object Vtranslation_table_vector
;
57 /* A char-table for characters which may invoke auto-filling. */
58 Lisp_Object Vauto_fill_chars
;
60 Lisp_Object Qauto_fill_chars
;
62 Lisp_Object Vchar_unify_table
;
64 /* A char-table. An element is non-nil iff the corresponding
65 character has a printable glyph. */
66 Lisp_Object Vprintable_chars
;
68 /* A char-table. An elemnent is a column-width of the corresponding
70 Lisp_Object Vchar_width_table
;
72 /* A char-table. An element is a symbol indicating the direction
73 property of corresponding character. */
74 Lisp_Object Vchar_direction_table
;
76 /* Variables used locally in the macro FETCH_MULTIBYTE_CHAR. */
77 unsigned char *_fetch_multibyte_char_p
;
78 int _fetch_multibyte_char_len
;
83 char_string_with_unification (c
, p
, advanced
)
85 unsigned char *p
, **advanced
;
91 if (c
<= MAX_3_BYTE_CHAR
|| c
> MAX_5_BYTE_CHAR
)
93 bytes
= CHAR_STRING (c
, p
);
95 else if (c
<= MAX_4_BYTE_CHAR
)
97 p
[0] = (0xF0 | (c
>> 18));
98 p
[1] = (0x80 | ((c
>> 12) & 0x3F));
99 p
[2] = (0x80 | ((c
>> 6) & 0x3F));
100 p
[3] = (0x80 | (c
& 0x3F));
106 p
[1] = (0x80 | ((c
>> 18) & 0x0F));
107 p
[2] = (0x80 | ((c
>> 12) & 0x3F));
108 p
[3] = (0x80 | ((c
>> 6) & 0x3F));
109 p
[4] = (0x80 | (c
& 0x3F));
113 *advanced
= p
+ bytes
;
119 string_char_with_unification (p
, advanced
, len
)
120 unsigned char *p
, **advanced
;
124 unsigned char *saved_p
= p
;
126 if (*p
< 0x80 || ! (*p
& 0x20) || ! (*p
& 0x10))
128 c
= STRING_CHAR_ADVANCE (p
);
130 else if (! (*p
& 0x08))
132 c
= ((((p
)[0] & 0xF) << 18)
133 | (((p
)[1] & 0x3F) << 12)
134 | (((p
)[2] & 0x3F) << 6)
140 c
= ((((p
)[1] & 0x3F) << 18)
141 | (((p
)[2] & 0x3F) << 12)
142 | (((p
)[3] & 0x3F) << 6)
147 MAYBE_UNIFY_CHAR (c
);
157 /* Translate character C by translation table TABLE. If C is
158 negative, translate a character specified by CHARSET and CODE. If
159 no translation is found in TABLE, return the untranslated
163 translate_char (table
, c
)
169 if (! CHAR_TABLE_P (table
))
171 ch
= CHAR_TABLE_REF (table
, c
);
172 if (! CHARACTERP (ch
))
177 /* Convert the unibyte character C to the corresponding multibyte
178 character based on the current value of charset_primary. If C
179 can't be converted, return C. */
182 unibyte_char_to_multibyte (c
)
185 struct charset
*charset
= CHARSET_FROM_ID (charset_primary
);
186 int c1
= DECODE_CHAR (charset
, c
);
188 return ((c1
>= 0) ? c1
: c
);
192 /* Convert the multibyte character C to unibyte 8-bit character based
193 on the current value of charset_primary. If dimension of
194 charset_primary is more than one, return (C & 0xFF).
196 The argument REV_TBL is now ignored. It will be removed in the
200 multibyte_char_to_unibyte (c
, rev_tbl
)
204 struct charset
*charset
= CHARSET_FROM_ID (charset_primary
);
205 unsigned c1
= ENCODE_CHAR (charset
, c
);
207 return ((c1
!= CHARSET_INVALID_CODE (charset
)) ? c1
: c
& 0xFF);
211 DEFUN ("characterp", Fcharacterp
, Scharacterp
, 1, 2, 0,
212 doc
: /* Return non-nil if OBJECT is a character. */)
214 Lisp_Object object
, ignore
;
216 return (CHARACTERP (object
) ? Qt
: Qnil
);
219 DEFUN ("max-char", Fmax_char
, Smax_char
, 0, 0, 0,
220 doc
: /* Return the character of the maximum code. */)
223 return make_number (MAX_CHAR
);
226 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte
,
227 Sunibyte_char_to_multibyte
, 1, 1, 0,
228 doc
: /* Convert the unibyte character CH to multibyte character.
229 The multibyte character is a result of decoding CH by
230 the current primary charset (value of `charset-primary'). */)
235 struct charset
*charset
;
237 CHECK_CHARACTER (ch
);
240 error ("Invalid unibyte character: %d", c
);
241 charset
= CHARSET_FROM_ID (charset_primary
);
242 c
= DECODE_CHAR (charset
, c
);
244 error ("Can't convert to multibyte character: %d", XINT (ch
));
245 return make_number (c
);
248 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte
,
249 Smultibyte_char_to_unibyte
, 1, 1, 0,
250 doc
: /* Convert the multibyte character CH to unibyte character.\n\
251 The unibyte character is a result of encoding CH by
252 the current primary charset (value of `charset-primary'). */)
258 struct charset
*charset
;
260 CHECK_CHARACTER (ch
);
262 charset
= CHARSET_FROM_ID (charset_primary
);
263 code
= ENCODE_CHAR (charset
, c
);
264 if (code
< CHARSET_MIN_CODE (charset
)
265 || code
> CHARSET_MAX_CODE (charset
))
266 error ("Can't convert to unibyte character: %d", XINT (ch
));
267 return make_number (code
);
270 DEFUN ("char-bytes", Fchar_bytes
, Schar_bytes
, 1, 1, 0,
271 doc
: /* Return 1 regardless of the argument CHAR.
272 This is now an obsolete function. We keep it just for backward compatibility. */)
276 CHECK_CHARACTER (ch
);
277 return make_number (1);
280 DEFUN ("char-width", Fchar_width
, Schar_width
, 1, 1, 0,
281 doc
: /* Return width of CHAR when displayed in the current buffer.
282 The width is measured by how many columns it occupies on the screen.
283 Tab is taken to occupy `tab-width' columns. */)
289 struct Lisp_Char_Table
*dp
= buffer_display_table ();
291 CHECK_CHARACTER (ch
);
294 /* Get the way the display table would display it. */
295 disp
= dp
? DISP_CHAR_VECTOR (dp
, c
) : Qnil
;
298 width
= ASIZE (disp
);
300 width
= CHAR_WIDTH (c
);
302 return make_number (width
);
305 /* Return width of string STR of length LEN when displayed in the
306 current buffer. The width is measured by how many columns it
307 occupies on the screen. */
314 return c_string_width (str
, len
, -1, NULL
, NULL
);
317 /* Return width of string STR of length LEN when displayed in the
318 current buffer. The width is measured by how many columns it
319 occupies on the screen. If PRECISION > 0, return the width of
320 longest substring that doesn't exceed PRECISION, and set number of
321 characters and bytes of the substring in *NCHARS and *NBYTES
324 c_string_width (str
, len
, precision
, nchars
, nbytes
)
326 int precision
, *nchars
, *nbytes
;
328 int i
= 0, i_byte
= 0;
330 struct Lisp_Char_Table
*dp
= buffer_display_table ();
334 int bytes
, thiswidth
;
336 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
340 val
= DISP_CHAR_VECTOR (dp
, c
);
342 thiswidth
= XVECTOR (val
)->size
;
344 thiswidth
= CHAR_WIDTH (c
);
348 thiswidth
= CHAR_WIDTH (c
);
352 && (width
+ thiswidth
> precision
))
372 /* Return width of Lisp string STRING when displayed in the current
373 buffer. The width is measured by how many columns it occupies on
374 the screen while paying attention to compositions. If PRECISION >
375 0, return the width of longest substring that doesn't exceed
376 PRECISION, and set number of characters and bytes of the substring
377 in *NCHARS and *NBYTES respectively. */
380 lisp_string_width (string
, precision
, nchars
, nbytes
)
382 int precision
, *nchars
, *nbytes
;
384 int len
= XSTRING (string
)->size
;
385 int len_byte
= STRING_BYTES (XSTRING (string
));
386 unsigned char *str
= XSTRING (string
)->data
;
387 int i
= 0, i_byte
= 0;
389 struct Lisp_Char_Table
*dp
= buffer_display_table ();
393 int chars
, bytes
, thiswidth
;
398 if (find_composition (i
, -1, &ignore
, &end
, &val
, string
)
399 && ((cmp_id
= get_composition_id (i
, i_byte
, end
- i
, val
, string
))
402 thiswidth
= composition_table
[cmp_id
]->width
;
404 bytes
= string_char_to_byte (string
, end
) - i_byte
;
408 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
411 val
= DISP_CHAR_VECTOR (dp
, c
);
413 thiswidth
= XVECTOR (val
)->size
;
415 thiswidth
= CHAR_WIDTH (c
);
419 int c
= STRING_CHAR_AND_LENGTH (str
+ i_byte
, len
- i_byte
, bytes
);
422 thiswidth
= CHAR_WIDTH (c
);
426 && (width
+ thiswidth
> precision
))
446 DEFUN ("string-width", Fstring_width
, Sstring_width
, 1, 1, 0,
447 doc
: /* Return width of STRING when displayed in the current buffer.
448 Width is measured by how many columns it occupies on the screen.
449 When calculating width of a multibyte character in STRING,
450 only the base leading-code is considered; the validity of
451 the following bytes is not checked. Tabs in STRING are always
452 taken to occupy `tab-width' columns. */)
459 XSETFASTINT (val
, lisp_string_width (str
, -1, NULL
, NULL
));
463 DEFUN ("char-direction", Fchar_direction
, Schar_direction
, 1, 1, 0,
464 doc
: /* Return the direction of CHAR.
465 The returned value is 0 for left-to-right and 1 for right-to-left. */)
471 CHECK_CHARACTER (ch
);
473 return CHAR_TABLE_REF (Vchar_direction_table
, c
);
476 DEFUN ("chars-in-region", Fchars_in_region
, Schars_in_region
, 2, 2, 0,
477 doc
: /* Return number of characters between BEG and END.
478 This is now an obsolete function. We keep it just for backward compatibility. */)
480 Lisp_Object beg
, end
;
484 CHECK_NUMBER_COERCE_MARKER (beg
);
485 CHECK_NUMBER_COERCE_MARKER (end
);
487 from
= min (XFASTINT (beg
), XFASTINT (end
));
488 to
= max (XFASTINT (beg
), XFASTINT (end
));
490 return make_number (to
- from
);
493 /* Return the number of characters in the NBYTES bytes at PTR.
494 This works by looking at the contents and checking for multibyte
495 sequences while assuming that there's no invalid sequence.
496 However, if the current buffer has enable-multibyte-characters =
497 nil, we treat each byte as a character. */
500 chars_in_text (ptr
, nbytes
)
504 /* current_buffer is null at early stages of Emacs initialization. */
505 if (current_buffer
== 0
506 || NILP (current_buffer
->enable_multibyte_characters
))
509 return multibyte_chars_in_text (ptr
, nbytes
);
512 /* Return the number of characters in the NBYTES bytes at PTR.
513 This works by looking at the contents and checking for multibyte
514 sequences while assuming that there's no invalid sequence. It
515 ignores enable-multibyte-characters. */
518 multibyte_chars_in_text (ptr
, nbytes
)
522 unsigned char *endp
= ptr
+ nbytes
;
527 int len
= MULTIBYTE_LENGTH (ptr
, endp
);
538 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
539 characters and bytes in it, and store them in *NCHARS and *NBYTES
540 respectively. On counting bytes, pay attention to that 8-bit
541 characters not constructing a valid multibyte sequence are
542 represented by 2-byte in a multibyte text. */
545 parse_str_as_multibyte (str
, len
, nchars
, nbytes
)
547 int len
, *nchars
, *nbytes
;
549 unsigned char *endp
= str
+ len
;
550 int n
, chars
= 0, bytes
= 0;
552 if (len
>= MAX_MULTIBYTE_LENGTH
)
554 unsigned char *adjusted_endp
= endp
- MAX_MULTIBYTE_LENGTH
;
555 while (str
< adjusted_endp
)
557 if ((n
= MULTIBYTE_LENGTH_NO_CHECK (str
)) > 0)
558 str
+= n
, bytes
+= n
;
566 if ((n
= MULTIBYTE_LENGTH (str
, endp
)) > 0)
567 str
+= n
, bytes
+= n
;
578 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
579 It actually converts only such 8-bit characters that don't contruct
580 a multibyte sequence to multibyte forms of Latin-1 characters. If
581 NCHARS is nonzero, set *NCHARS to the number of characters in the
582 text. It is assured that we can use LEN bytes at STR as a work
583 area and that is enough. Return the number of bytes of the
587 str_as_multibyte (str
, len
, nbytes
, nchars
)
589 int len
, nbytes
, *nchars
;
591 unsigned char *p
= str
, *endp
= str
+ nbytes
;
596 if (nbytes
>= MAX_MULTIBYTE_LENGTH
)
598 unsigned char *adjusted_endp
= endp
- MAX_MULTIBYTE_LENGTH
;
599 while (p
< adjusted_endp
600 && (n
= MULTIBYTE_LENGTH_NO_CHECK (p
)) > 0)
603 while ((n
= MULTIBYTE_LENGTH (p
, endp
)) > 0)
613 safe_bcopy ((char *) p
, (char *) (endp
- nbytes
), nbytes
);
616 if (nbytes
>= MAX_MULTIBYTE_LENGTH
)
618 unsigned char *adjusted_endp
= endp
- MAX_MULTIBYTE_LENGTH
;
619 while (p
< adjusted_endp
)
621 if ((n
= MULTIBYTE_LENGTH_NO_CHECK (p
)) > 0)
629 c
= BYTE8_TO_CHAR (c
);
630 to
+= CHAR_STRING (c
, to
);
637 if ((n
= MULTIBYTE_LENGTH (p
, endp
)) > 0)
645 c
= BYTE8_TO_CHAR (c
);
646 to
+= CHAR_STRING (c
, to
);
655 /* Parse unibyte string at STR of LEN bytes, and return the number of
656 bytes it may ocupy when converted to multibyte string by
657 `str_to_multibyte'. */
660 parse_str_to_multibyte (str
, len
)
664 unsigned char *endp
= str
+ len
;
667 for (bytes
= 0; str
< endp
; str
++)
668 bytes
+= (*str
< 0x80) ? 1 : 2;
673 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
674 that contains the same single-byte characters. It actually
675 converts all 8-bit characters to multibyte forms. It is assured
676 that we can use LEN bytes at STR as a work area and that is
680 str_to_multibyte (str
, len
, bytes
)
684 unsigned char *p
= str
, *endp
= str
+ bytes
;
687 while (p
< endp
&& *p
< 0x80) p
++;
693 safe_bcopy ((char *) p
, (char *) (endp
- bytes
), bytes
);
700 c
= BYTE8_TO_CHAR (c
);
701 to
+= CHAR_STRING (c
, to
);
706 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
707 actually converts characters in the range 0x80..0xFF to
711 str_as_unibyte (str
, bytes
)
715 unsigned char *p
= str
, *endp
= str
+ bytes
;
716 unsigned char *to
= str
;
722 len
= BYTES_BY_CHAR_HEAD (c
);
723 if (CHAR_BYTE8_HEAD_P (c
))
731 len
= BYTES_BY_CHAR_HEAD (c
);
732 if (CHAR_BYTE8_HEAD_P (c
))
734 c
= STRING_CHAR_ADVANCE (p
);
735 *to
++ = CHAR_TO_BYTE8 (c
);
739 while (len
--) *to
++ = *p
++;
746 string_count_byte8 (string
)
749 int multibyte
= STRING_MULTIBYTE (string
);
750 int nchars
= XSTRING (string
)->size
;
751 int nbytes
= STRING_BYTES (XSTRING (string
));
752 unsigned char *p
= XSTRING (string
)->data
;
753 unsigned char *pend
= p
+ nbytes
;
761 len
= BYTES_BY_CHAR_HEAD (c
);
763 if (CHAR_BYTE8_HEAD_P (c
))
778 string_escape_byte8 (string
)
781 int nchars
= XSTRING (string
)->size
;
782 int nbytes
= STRING_BYTES (XSTRING (string
));
783 int multibyte
= STRING_MULTIBYTE (string
);
785 unsigned char *src
, *src_end
, *dst
;
789 if (multibyte
&& nchars
== nbytes
)
792 byte8_count
= string_count_byte8 (string
);
794 if (byte8_count
== 0)
798 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
799 val
= make_uninit_multibyte_string (nchars
+ byte8_count
* 2,
800 nbytes
+ byte8_count
* 2);
802 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
803 val
= make_uninit_string (nbytes
+ byte8_count
* 3);
805 src
= XSTRING (string
)->data
;
806 src_end
= src
+ nbytes
;
807 dst
= XSTRING (val
)->data
;
809 while (src
< src_end
)
812 len
= BYTES_BY_CHAR_HEAD (c
);
814 if (CHAR_BYTE8_HEAD_P (c
))
816 c
= STRING_CHAR_ADVANCE (src
);
817 c
= CHAR_TO_BYTE8 (c
);
818 sprintf (dst
, "\\%03o", c
);
822 while (len
--) *dst
++ = *src
++;
825 while (src
< src_end
)
830 sprintf (dst
, "\\%03o", c
);
840 DEFUN ("string", Fstring
, Sstring
, 1, MANY
, 0,
842 Concatenate all the argument characters and make the result a string. */)
848 unsigned char *buf
= (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH
* n
);
849 unsigned char *p
= buf
;
852 for (i
= 0; i
< n
; i
++)
854 CHECK_CHARACTER (args
[i
]);
856 p
+= CHAR_STRING (c
, p
);
859 return make_string_from_bytes ((char *) buf
, n
, p
- buf
);
863 init_character_once ()
872 DEFSYM (Qcharacterp
, "characterp");
873 DEFSYM (Qauto_fill_chars
, "auto-fill-chars");
875 staticpro (&Vchar_unify_table
);
876 Vchar_unify_table
= Qnil
;
878 defsubr (&Smax_char
);
879 defsubr (&Scharacterp
);
880 defsubr (&Sunibyte_char_to_multibyte
);
881 defsubr (&Smultibyte_char_to_unibyte
);
882 defsubr (&Schar_bytes
);
883 defsubr (&Schar_width
);
884 defsubr (&Sstring_width
);
885 defsubr (&Schar_direction
);
886 defsubr (&Schars_in_region
);
889 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector
,
891 Vector of cons cell of a symbol and translation table ever defined.
892 An ID of a translation table is an index of this vector. */);
893 Vtranslation_table_vector
= Fmake_vector (make_number (16), Qnil
);
895 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars
,
897 A char-table for characters which invoke auto-filling.
898 Such characters have value t in this table. */);
899 Vauto_fill_chars
= Fmake_char_table (Qauto_fill_chars
, Qnil
);
900 CHAR_TABLE_SET (Vauto_fill_chars
, make_number (' '), Qt
);
901 CHAR_TABLE_SET (Vauto_fill_chars
, make_number ('\n'), Qt
);
903 DEFVAR_LISP ("char-width-table", &Vchar_width_table
,
905 A char-table for width (columns) of each character. */);
906 Vchar_width_table
= Fmake_char_table (Qnil
, make_number (1));
908 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table
,
909 doc
: /* A char-table for direction of each character. */);
910 Vchar_direction_table
= Fmake_char_table (Qnil
, make_number (1));
912 DEFVAR_LISP ("printable-chars", &Vprintable_chars
,
913 doc
: /* A char-table for each printable character. */);
914 Vprintable_chars
= Fmake_char_table (Qnil
, Qt
);