(Custom-mode): Set up tool-bar-map unconditionally.
[bpt/emacs.git] / src / character.c
CommitLineData
0168c3d8
KH
1/* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
8f924df7 3 Licensed to the Free Software Foundation.
ec62e0ac
GM
4 Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 Free Software Foundation, Inc.
6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008
0168c3d8
KH
7 National Institute of Advanced Industrial Science and Technology (AIST)
8 Registration Number H13PRO009
9
10This file is part of GNU Emacs.
11
9ec0b715 12GNU Emacs is free software: you can redistribute it and/or modify
0168c3d8 13it under the terms of the GNU General Public License as published by
9ec0b715
GM
14the Free Software Foundation, either version 3 of the License, or
15(at your option) any later version.
0168c3d8
KH
16
17GNU Emacs is distributed in the hope that it will be useful,
18but WITHOUT ANY WARRANTY; without even the implied warranty of
19MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20GNU General Public License for more details.
21
22You should have received a copy of the GNU General Public License
9ec0b715 23along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
0168c3d8
KH
24
25/* At first, see the document in `character.h' to understand the code
26 in this file. */
27
28#ifdef emacs
29#include <config.h>
30#endif
31
32#include <stdio.h>
33
34#ifdef emacs
35
36#include <sys/types.h>
37#include "lisp.h"
38#include "character.h"
39#include "buffer.h"
40#include "charset.h"
41#include "composite.h"
42#include "disptab.h"
43
44#else /* not emacs */
45
46#include "mulelib.h"
47
48#endif /* emacs */
49
50Lisp_Object Qcharacterp;
51
52/* Vector of translation table ever defined.
53 ID of a translation table is used to index this vector. */
54Lisp_Object Vtranslation_table_vector;
55
56/* A char-table for characters which may invoke auto-filling. */
57Lisp_Object Vauto_fill_chars;
58
59Lisp_Object Qauto_fill_chars;
60
33f91981
KH
61/* Char-table of information about which character to unify to which
62 Unicode character. */
0168c3d8
KH
63Lisp_Object Vchar_unify_table;
64
65/* A char-table. An element is non-nil iff the corresponding
66 character has a printable glyph. */
67Lisp_Object Vprintable_chars;
68
69/* A char-table. An elemnent is a column-width of the corresponding
70 character. */
71Lisp_Object Vchar_width_table;
72
73/* A char-table. An element is a symbol indicating the direction
74 property of corresponding character. */
75Lisp_Object Vchar_direction_table;
76
8973478b 77/* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
0168c3d8 78unsigned char *_fetch_multibyte_char_p;
0168c3d8 79
c57f3328
KH
80/* Char table of scripts. */
81Lisp_Object Vchar_script_table;
82
c7e14352
KH
83/* Alist of scripts vs representative characters. */
84Lisp_Object Vscript_representative_chars;
85
c57f3328
KH
86static Lisp_Object Qchar_script_table;
87
a3cbb631
KH
88Lisp_Object Vunicode_category_table;
89
b672c5ae
KH
90/* Mapping table from unibyte chars to multibyte chars. */
91int unibyte_to_multibyte_table[256];
15843e6f 92
90c9d035
KH
93/* Nth element is 1 iff unibyte char N can be mapped to a multibyte
94 char. */
95char unibyte_has_multibyte_table[256];
96
0168c3d8
KH
97\f
98
2bde7652
KH
99/* If character code C has modifier masks, reflect them to the
100 character code if possible. Return the resulting code. */
101
102int
103char_resolve_modifier_mask (c)
104 int c;
105{
d0363d44 106 /* A non-ASCII character can't reflect modifier bits to the code. */
2bde7652
KH
107 if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
108 return c;
109
110 /* For Meta, Shift, and Control modifiers, we need special care. */
2bde7652
KH
111 if (c & CHAR_SHIFT)
112 {
113 /* Shift modifier is valid only with [A-Za-z]. */
114 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
115 c &= ~CHAR_SHIFT;
116 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
117 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
03365d0e
KH
118 /* Shift modifier for control characters and SPC is ignored. */
119 else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
d0363d44
KH
120 c &= ~CHAR_SHIFT;
121 }
2bde7652
KH
122 if (c & CHAR_CTL)
123 {
124 /* Simulate the code in lread.c. */
125 /* Allow `\C- ' and `\C-?'. */
03365d0e
KH
126 if ((c & 0377) == ' ')
127 c &= ~0177 & ~ CHAR_CTL;
128 else if ((c & 0377) == '?')
129 c = 0177 | (c & ~0177 & ~CHAR_CTL);
2bde7652
KH
130 /* ASCII control chars are made from letters (both cases),
131 as well as the non-letters within 0100...0137. */
132 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
133 c &= (037 | (~0177 & ~CHAR_CTL));
134 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
135 c &= (037 | (~0177 & ~CHAR_CTL));
136 }
03365d0e
KH
137 if (c & CHAR_META)
138 {
139 /* Move the meta bit to the right place for a string. */
140 c = (c & ~CHAR_META) | 0x80;
141 }
2bde7652
KH
142
143 return c;
144}
145
146
33f91981
KH
147/* Store multibyte form of character C at P. If C has modifier bits,
148 handle them appropriately. */
149
0168c3d8 150int
e3d8eb8c 151char_string (c, p)
5aa91c9b 152 unsigned c;
1889b238 153 unsigned char *p;
0168c3d8
KH
154{
155 int bytes;
156
e3d8eb8c
KH
157 if (c & CHAR_MODIFIER_MASK)
158 {
2bde7652 159 c = (unsigned) char_resolve_modifier_mask ((int) c);
e3d8eb8c
KH
160 /* If C still has any modifier bits, just ignore it. */
161 c &= ~CHAR_MODIFIER_MASK;
162 }
163
0168c3d8
KH
164 MAYBE_UNIFY_CHAR (c);
165
e3d8eb8c 166 if (c <= MAX_3_BYTE_CHAR)
0168c3d8
KH
167 {
168 bytes = CHAR_STRING (c, p);
169 }
170 else if (c <= MAX_4_BYTE_CHAR)
171 {
172 p[0] = (0xF0 | (c >> 18));
173 p[1] = (0x80 | ((c >> 12) & 0x3F));
174 p[2] = (0x80 | ((c >> 6) & 0x3F));
175 p[3] = (0x80 | (c & 0x3F));
176 bytes = 4;
177 }
e3d8eb8c 178 else if (c <= MAX_5_BYTE_CHAR)
0168c3d8
KH
179 {
180 p[0] = 0xF8;
181 p[1] = (0x80 | ((c >> 18) & 0x0F));
182 p[2] = (0x80 | ((c >> 12) & 0x3F));
183 p[3] = (0x80 | ((c >> 6) & 0x3F));
184 p[4] = (0x80 | (c & 0x3F));
185 bytes = 5;
186 }
5aa91c9b 187 else if (c <= MAX_CHAR)
e3d8eb8c
KH
188 {
189 c = CHAR_TO_BYTE8 (c);
190 bytes = BYTE8_STRING (c, p);
191 }
5aa91c9b
KH
192 else
193 error ("Invalid character: %d", c);
1889b238 194
0168c3d8
KH
195 return bytes;
196}
197
198
33f91981
KH
/* Return the character whose multibyte form is at P.  If LEN is not
   NULL, it must point to an int; *LEN is set to the byte length of
   the multibyte form.  If ADVANCED is not NULL, it must point to a
   `const unsigned char *'; *ADVANCED is set to the address just past
   the multibyte form (i.e. the start of the next character).  */

int
string_char (p, advanced, len)
     const unsigned char *p;
     const unsigned char **advanced;
     int *len;
{
  const unsigned char *start = p;
  unsigned char lead = *p;
  int c;

  if (lead < 0x80 || (lead & 0x30) != 0x30)
    /* Short forms are handled by the generic macro.  */
    c = STRING_CHAR_ADVANCE (p);
  else if (! (lead & 0x08))
    {
      /* 4-byte form.  */
      c = (((p[0] & 0xF) << 18)
           | ((p[1] & 0x3F) << 12)
           | ((p[2] & 0x3F) << 6)
           | (p[3] & 0x3F));
      p += 4;
    }
  else
    {
      /* 5-byte form; the leading byte carries no code bits.  */
      c = (((p[1] & 0x3F) << 18)
           | ((p[2] & 0x3F) << 12)
           | ((p[3] & 0x3F) << 6)
           | (p[4] & 0x3F));
      p += 5;
    }

  MAYBE_UNIFY_CHAR (c);

  if (len)
    *len = p - start;
  if (advanced)
    *advanced = p;
  return c;
}
244
245
246/* Translate character C by translation table TABLE. If C is
247 negative, translate a character specified by CHARSET and CODE. If
248 no translation is found in TABLE, return the untranslated
10453be9
KH
249 character. If TABLE is a list, elements are char tables. In this
250 case, translace C by all tables. */
0168c3d8
KH
251
252int
253translate_char (table, c)
254 Lisp_Object table;
255 int c;
256{
10453be9
KH
257 if (CHAR_TABLE_P (table))
258 {
259 Lisp_Object ch;
260
261 ch = CHAR_TABLE_REF (table, c);
262 if (CHARACTERP (ch))
263 c = XINT (ch);
264 }
265 else
266 {
267 for (; CONSP (table); table = XCDR (table))
268 c = translate_char (XCAR (table), c);
269 }
270 return c;
0168c3d8
KH
271}
272
0168c3d8 273/* Convert the multibyte character C to unibyte 8-bit character based
ac86488b
KH
274 on the current value of charset_unibyte. If dimension of
275 charset_unibyte is more than one, return (C & 0xFF).
0168c3d8
KH
276
277 The argument REV_TBL is now ignored. It will be removed in the
278 future. */
279
280int
281multibyte_char_to_unibyte (c, rev_tbl)
282 int c;
283 Lisp_Object rev_tbl;
284{
b672c5ae
KH
285 struct charset *charset;
286 unsigned c1;
0168c3d8 287
b672c5ae
KH
288 if (CHAR_BYTE8_P (c))
289 return CHAR_TO_BYTE8 (c);
290 charset = CHARSET_FROM_ID (charset_unibyte);
291 c1 = ENCODE_CHAR (charset, c);
0168c3d8
KH
292 return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
293}
294
935d5b02
KH
295/* Like multibyte_char_to_unibyte, but return -1 if C is not supported
296 by charset_unibyte. */
297
298int
299multibyte_char_to_unibyte_safe (c)
300 int c;
301{
302 struct charset *charset;
303 unsigned c1;
304
305 if (CHAR_BYTE8_P (c))
306 return CHAR_TO_BYTE8 (c);
307 charset = CHARSET_FROM_ID (charset_unibyte);
308 c1 = ENCODE_CHAR (charset, c);
309 return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : -1);
310}
0168c3d8
KH
311
312DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
313 doc: /* Return non-nil if OBJECT is a character. */)
314 (object, ignore)
315 Lisp_Object object, ignore;
316{
317 return (CHARACTERP (object) ? Qt : Qnil);
318}
319
320DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
321 doc: /* Return the character of the maximum code. */)
322 ()
323{
324 return make_number (MAX_CHAR);
325}
326
327DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
328 Sunibyte_char_to_multibyte, 1, 1, 0,
5556875b 329 doc: /* Convert the byte CH to multibyte character. */)
0168c3d8
KH
330 (ch)
331 Lisp_Object ch;
332{
333 int c;
334 struct charset *charset;
335
336 CHECK_CHARACTER (ch);
337 c = XFASTINT (ch);
338 if (c >= 0400)
339 error ("Invalid unibyte character: %d", c);
ac86488b 340 charset = CHARSET_FROM_ID (charset_unibyte);
0168c3d8
KH
341 c = DECODE_CHAR (charset, c);
342 if (c < 0)
3c5a53bd 343 c = BYTE8_TO_CHAR (XFASTINT (ch));
0168c3d8
KH
344 return make_number (c);
345}
346
347DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
348 Smultibyte_char_to_unibyte, 1, 1, 0,
5556875b
SM
349 doc: /* Convert the multibyte character CH to a byte.
350If the multibyte character does not represent a byte, return -1. */)
0168c3d8
KH
351 (ch)
352 Lisp_Object ch;
353{
5556875b 354 int cm;
0168c3d8
KH
355
356 CHECK_CHARACTER (ch);
5556875b
SM
357 cm = XFASTINT (ch);
358 if (cm < 256)
359 /* Can't distinguish a byte read from a unibyte buffer from
360 a latin1 char, so let's let it slide. */
361 return ch;
362 else
363 {
2afc21f5 364 int cu = CHAR_TO_BYTE_SAFE (cm);
5556875b
SM
365 return make_number (cu);
366 }
0168c3d8
KH
367}
368
369DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
370 doc: /* Return 1 regardless of the argument CHAR.
add553ac
JB
371This is now an obsolete function. We keep it just for backward compatibility.
372usage: (char-bytes CHAR) */)
0168c3d8
KH
373 (ch)
374 Lisp_Object ch;
375{
376 CHECK_CHARACTER (ch);
377 return make_number (1);
378}
379
380DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
381 doc: /* Return width of CHAR when displayed in the current buffer.
382The width is measured by how many columns it occupies on the screen.
add553ac
JB
383Tab is taken to occupy `tab-width' columns.
384usage: (char-width CHAR) */)
0168c3d8
KH
385 (ch)
386 Lisp_Object ch;
387{
388 Lisp_Object disp;
389 int c, width;
390 struct Lisp_Char_Table *dp = buffer_display_table ();
391
392 CHECK_CHARACTER (ch);
393 c = XINT (ch);
394
395 /* Get the way the display table would display it. */
396 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
397
398 if (VECTORP (disp))
399 width = ASIZE (disp);
400 else
401 width = CHAR_WIDTH (c);
402
403 return make_number (width);
404}
405
0168c3d8
KH
406/* Return width of string STR of length LEN when displayed in the
407 current buffer. The width is measured by how many columns it
408 occupies on the screen. If PRECISION > 0, return the width of
409 longest substring that doesn't exceed PRECISION, and set number of
410 characters and bytes of the substring in *NCHARS and *NBYTES
411 respectively. */
412
1889b238 413int
0168c3d8 414c_string_width (str, len, precision, nchars, nbytes)
8f924df7 415 const unsigned char *str;
0168c3d8
KH
416 int precision, *nchars, *nbytes;
417{
418 int i = 0, i_byte = 0;
419 int width = 0;
420 struct Lisp_Char_Table *dp = buffer_display_table ();
421
422 while (i_byte < len)
423 {
424 int bytes, thiswidth;
425 Lisp_Object val;
426 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
427
428 if (dp)
429 {
430 val = DISP_CHAR_VECTOR (dp, c);
431 if (VECTORP (val))
432 thiswidth = XVECTOR (val)->size;
433 else
434 thiswidth = CHAR_WIDTH (c);
435 }
436 else
437 {
438 thiswidth = CHAR_WIDTH (c);
439 }
440
441 if (precision > 0
442 && (width + thiswidth > precision))
443 {
444 *nchars = i;
445 *nbytes = i_byte;
446 return width;
447 }
448 i++;
449 i_byte += bytes;
450 width += thiswidth;
451 }
452
453 if (precision > 0)
454 {
455 *nchars = i;
456 *nbytes = i_byte;
457 }
458
459 return width;
460}
461
1889b238
KH
/* Return the width of string STR of length LEN when displayed in the
   current buffer, measured in screen columns.  This is
   c_string_width without a precision limit.  */

int
strwidth (str, len)
     unsigned char *str;
     int len;
{
  int columns = c_string_width (str, len, -1, NULL, NULL);

  return columns;
}
473
0168c3d8
KH
474/* Return width of Lisp string STRING when displayed in the current
475 buffer. The width is measured by how many columns it occupies on
476 the screen while paying attention to compositions. If PRECISION >
477 0, return the width of longest substring that doesn't exceed
478 PRECISION, and set number of characters and bytes of the substring
479 in *NCHARS and *NBYTES respectively. */
480
481int
482lisp_string_width (string, precision, nchars, nbytes)
483 Lisp_Object string;
484 int precision, *nchars, *nbytes;
485{
8f924df7 486 int len = SCHARS (string);
0aee65b9
KH
487 /* This set multibyte to 0 even if STRING is multibyte when it
488 contains only ascii and eight-bit-graphic, but that's
489 intentional. */
490 int multibyte = len < SBYTES (string);
8f924df7 491 unsigned char *str = SDATA (string);
0168c3d8
KH
492 int i = 0, i_byte = 0;
493 int width = 0;
494 struct Lisp_Char_Table *dp = buffer_display_table ();
495
496 while (i < len)
497 {
498 int chars, bytes, thiswidth;
499 Lisp_Object val;
500 int cmp_id;
f4bc0685 501 EMACS_INT ignore, end;
0168c3d8
KH
502
503 if (find_composition (i, -1, &ignore, &end, &val, string)
504 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
505 >= 0))
506 {
507 thiswidth = composition_table[cmp_id]->width;
508 chars = end - i;
509 bytes = string_char_to_byte (string, end) - i_byte;
510 }
0168c3d8
KH
511 else
512 {
0aee65b9 513 int c;
0168c3d8 514
0aee65b9
KH
515 if (multibyte)
516 c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
517 else
518 c = str[i_byte], bytes = 1;
0168c3d8 519 chars = 1;
0aee65b9
KH
520 if (dp)
521 {
522 val = DISP_CHAR_VECTOR (dp, c);
523 if (VECTORP (val))
524 thiswidth = XVECTOR (val)->size;
525 else
526 thiswidth = CHAR_WIDTH (c);
527 }
528 else
529 {
530 thiswidth = CHAR_WIDTH (c);
531 }
0168c3d8
KH
532 }
533
534 if (precision > 0
535 && (width + thiswidth > precision))
536 {
537 *nchars = i;
538 *nbytes = i_byte;
539 return width;
540 }
541 i += chars;
542 i_byte += bytes;
543 width += thiswidth;
544 }
545
546 if (precision > 0)
547 {
548 *nchars = i;
549 *nbytes = i_byte;
550 }
551
552 return width;
553}
554
555DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
556 doc: /* Return width of STRING when displayed in the current buffer.
557Width is measured by how many columns it occupies on the screen.
558When calculating width of a multibyte character in STRING,
559only the base leading-code is considered; the validity of
560the following bytes is not checked. Tabs in STRING are always
add553ac
JB
561taken to occupy `tab-width' columns.
562usage: (string-width STRING) */)
0168c3d8
KH
563 (str)
564 Lisp_Object str;
565{
566 Lisp_Object val;
567
568 CHECK_STRING (str);
569 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
570 return val;
571}
572
573DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
574 doc: /* Return the direction of CHAR.
add553ac
JB
575The returned value is 0 for left-to-right and 1 for right-to-left.
576usage: (char-direction CHAR) */)
0168c3d8
KH
577 (ch)
578 Lisp_Object ch;
579{
580 int c;
581
582 CHECK_CHARACTER (ch);
583 c = XINT (ch);
584 return CHAR_TABLE_REF (Vchar_direction_table, c);
585}
586
0168c3d8
KH
587/* Return the number of characters in the NBYTES bytes at PTR.
588 This works by looking at the contents and checking for multibyte
589 sequences while assuming that there's no invalid sequence.
590 However, if the current buffer has enable-multibyte-characters =
591 nil, we treat each byte as a character. */
592
13818c30 593EMACS_INT
0168c3d8 594chars_in_text (ptr, nbytes)
8f924df7 595 const unsigned char *ptr;
13818c30 596 EMACS_INT nbytes;
0168c3d8
KH
597{
598 /* current_buffer is null at early stages of Emacs initialization. */
599 if (current_buffer == 0
600 || NILP (current_buffer->enable_multibyte_characters))
601 return nbytes;
602
603 return multibyte_chars_in_text (ptr, nbytes);
604}
605
606/* Return the number of characters in the NBYTES bytes at PTR.
607 This works by looking at the contents and checking for multibyte
608 sequences while assuming that there's no invalid sequence. It
609 ignores enable-multibyte-characters. */
610
13818c30 611EMACS_INT
0168c3d8 612multibyte_chars_in_text (ptr, nbytes)
8f924df7 613 const unsigned char *ptr;
13818c30 614 EMACS_INT nbytes;
0168c3d8 615{
8f924df7 616 const unsigned char *endp = ptr + nbytes;
0168c3d8
KH
617 int chars = 0;
618
619 while (ptr < endp)
620 {
621 int len = MULTIBYTE_LENGTH (ptr, endp);
622
623 if (len == 0)
624 abort ();
625 ptr += len;
626 chars++;
627 }
628
629 return chars;
630}
631
632/* Parse unibyte text at STR of LEN bytes as a multibyte text, count
633 characters and bytes in it, and store them in *NCHARS and *NBYTES
634 respectively. On counting bytes, pay attention to that 8-bit
635 characters not constructing a valid multibyte sequence are
636 represented by 2-byte in a multibyte text. */
637
638void
639parse_str_as_multibyte (str, len, nchars, nbytes)
8f924df7 640 const unsigned char *str;
0168c3d8
KH
641 int len, *nchars, *nbytes;
642{
8f924df7 643 const unsigned char *endp = str + len;
0168c3d8
KH
644 int n, chars = 0, bytes = 0;
645
646 if (len >= MAX_MULTIBYTE_LENGTH)
647 {
8f924df7 648 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
0168c3d8
KH
649 while (str < adjusted_endp)
650 {
651 if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
652 str += n, bytes += n;
653 else
654 str++, bytes += 2;
655 chars++;
656 }
657 }
658 while (str < endp)
659 {
660 if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
661 str += n, bytes += n;
662 else
663 str++, bytes += 2;
664 chars++;
665 }
666
667 *nchars = chars;
668 *nbytes = bytes;
669 return;
670}
671
672/* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
673 It actually converts only such 8-bit characters that don't contruct
674 a multibyte sequence to multibyte forms of Latin-1 characters. If
675 NCHARS is nonzero, set *NCHARS to the number of characters in the
676 text. It is assured that we can use LEN bytes at STR as a work
677 area and that is enough. Return the number of bytes of the
678 resulting text. */
679
680int
681str_as_multibyte (str, len, nbytes, nchars)
682 unsigned char *str;
683 int len, nbytes, *nchars;
684{
685 unsigned char *p = str, *endp = str + nbytes;
686 unsigned char *to;
687 int chars = 0;
688 int n;
689
690 if (nbytes >= MAX_MULTIBYTE_LENGTH)
691 {
692 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
693 while (p < adjusted_endp
694 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
695 p += n, chars++;
696 }
697 while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
698 p += n, chars++;
699 if (nchars)
700 *nchars = chars;
701 if (p == endp)
702 return nbytes;
703
704 to = p;
705 nbytes = endp - p;
706 endp = str + len;
707 safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
708 p = endp - nbytes;
709
710 if (nbytes >= MAX_MULTIBYTE_LENGTH)
711 {
712 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
713 while (p < adjusted_endp)
714 {
715 if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
716 {
717 while (n--)
718 *to++ = *p++;
719 }
720 else
721 {
722 int c = *p++;
723 c = BYTE8_TO_CHAR (c);
724 to += CHAR_STRING (c, to);
725 }
726 }
727 chars++;
728 }
729 while (p < endp)
730 {
731 if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
732 {
733 while (n--)
734 *to++ = *p++;
8f924df7 735 }
0168c3d8
KH
736 else
737 {
738 int c = *p++;
739 c = BYTE8_TO_CHAR (c);
740 to += CHAR_STRING (c, to);
741 }
742 chars++;
743 }
744 if (nchars)
745 *nchars = chars;
746 return (to - str);
747}
748
/* Parse the unibyte string at STR of LEN bytes, and return the number
   of bytes it may occupy when converted to a multibyte string by
   `str_to_multibyte': 1 for each byte below 0x80, 2 for each other
   byte.  */

int
parse_str_to_multibyte (str, len)
     unsigned char *str;
     int len;
{
  unsigned char *limit = str + len;
  int total = 0;

  while (str < limit)
    {
      if (*str++ < 0x80)
        total += 1;
      else
        total += 2;
    }
  return total;
}
765
766
/* Convert unibyte text at STR of BYTES bytes to a multibyte text that
   contains the same single-byte characters, and return the number of
   bytes of the result.  Every 8-bit byte is converted to a multibyte
   form.  The caller assures that LEN bytes at STR can be used as a
   work area and that this is enough.  */

int
str_to_multibyte (str, len, bytes)
     unsigned char *str;
     int len, bytes;
{
  unsigned char *src, *dst;
  unsigned char *limit = str + bytes;
  int rest;

  /* The leading ASCII run needs no conversion.  */
  for (src = str; src < limit && *src < 0x80; src++)
    ;
  if (src == limit)
    return bytes;

  /* Move the remaining bytes to the end of the work area, then
     expand them back from there into place.  */
  dst = src;
  rest = limit - src;
  limit = str + len;
  safe_bcopy ((char *) src, (char *) (limit - rest), rest);

  for (src = limit - rest; src < limit; )
    {
      int c = *src++;

      if (c >= 0x80)
        c = BYTE8_TO_CHAR (c);
      dst += CHAR_STRING (c, dst);
    }
  return (dst - str);
}
799
/* Arrange multibyte text at STR of BYTES bytes as a unibyte text, in
   place.  Each eight-bit character is replaced by its single byte;
   every other character is copied unchanged.  Return the number of
   bytes of the result.  */

int
str_as_unibyte (str, bytes)
     unsigned char *str;
     int bytes;
{
  const unsigned char *p = str, *endp = str + bytes;
  unsigned char *to;
  int c, len;

  /* Nothing has to move until the first eight-bit character.  */
  while (p < endp && ! CHAR_BYTE8_HEAD_P (*p))
    p += BYTES_BY_CHAR_HEAD (*p);

  to = str + (p - str);
  while (p < endp)
    {
      c = *p;
      if (CHAR_BYTE8_HEAD_P (c))
        {
          c = STRING_CHAR_ADVANCE (p);
          *to++ = CHAR_TO_BYTE8 (c);
        }
      else
        {
          for (len = BYTES_BY_CHAR_HEAD (c); len; len--)
            *to++ = *p++;
        }
    }
  return (to - str);
}
838
4aa40bb8
KH
839/* Convert eight-bit chars in SRC (in multibyte form) to the
840 corresponding byte and store in DST. CHARS is the number of
841 characters in SRC. The value is the number of bytes stored in DST.
842 Usually, the value is the same as CHARS, but is less than it if SRC
843 contains a non-ASCII, non-eight-bit characater. If ACCEPT_LATIN_1
844 is nonzero, a Latin-1 character is accepted and converted to a byte
f27f70ec
KH
845 of that character code.
846 Note: Currently the arg ACCEPT_LATIN_1 is not used. */
4aa40bb8
KH
847
848EMACS_INT
849str_to_unibyte (src, dst, chars, accept_latin_1)
850 const unsigned char *src;
851 unsigned char *dst;
852 EMACS_INT chars;
853 int accept_latin_1;
854{
855 EMACS_INT i;
856
857 for (i = 0; i < chars; i++)
858 {
859 int c = STRING_CHAR_ADVANCE (src);
860
861 if (CHAR_BYTE8_P (c))
862 c = CHAR_TO_BYTE8 (c);
863 else if (! ASCII_CHAR_P (c)
864 && (! accept_latin_1 || c >= 0x100))
865 return i;
866 *dst++ = c;
867 }
868 return i;
869}
870
871
0168c3d8
KH
872int
873string_count_byte8 (string)
874 Lisp_Object string;
875{
876 int multibyte = STRING_MULTIBYTE (string);
8f924df7
KH
877 int nbytes = SBYTES (string);
878 unsigned char *p = SDATA (string);
0168c3d8
KH
879 unsigned char *pend = p + nbytes;
880 int count = 0;
881 int c, len;
882
883 if (multibyte)
884 while (p < pend)
885 {
886 c = *p;
887 len = BYTES_BY_CHAR_HEAD (c);
888
889 if (CHAR_BYTE8_HEAD_P (c))
890 count++;
891 p += len;
892 }
893 else
894 while (p < pend)
895 {
896 if (*p++ >= 0x80)
897 count++;
898 }
899 return count;
900}
901
902
903Lisp_Object
904string_escape_byte8 (string)
905 Lisp_Object string;
906{
8f924df7
KH
907 int nchars = SCHARS (string);
908 int nbytes = SBYTES (string);
0168c3d8
KH
909 int multibyte = STRING_MULTIBYTE (string);
910 int byte8_count;
15843e6f
KH
911 const unsigned char *src, *src_end;
912 unsigned char *dst;
0168c3d8
KH
913 Lisp_Object val;
914 int c, len;
915
916 if (multibyte && nchars == nbytes)
917 return string;
918
919 byte8_count = string_count_byte8 (string);
920
921 if (byte8_count == 0)
922 return string;
923
924 if (multibyte)
925 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
7b40ebaf 926 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
0168c3d8
KH
927 nbytes + byte8_count * 2);
928 else
929 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
930 val = make_uninit_string (nbytes + byte8_count * 3);
931
8f924df7 932 src = SDATA (string);
0168c3d8 933 src_end = src + nbytes;
8f924df7 934 dst = SDATA (val);
0168c3d8
KH
935 if (multibyte)
936 while (src < src_end)
937 {
938 c = *src;
939 len = BYTES_BY_CHAR_HEAD (c);
940
941 if (CHAR_BYTE8_HEAD_P (c))
942 {
943 c = STRING_CHAR_ADVANCE (src);
944 c = CHAR_TO_BYTE8 (c);
1889b238 945 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
946 dst += 4;
947 }
948 else
949 while (len--) *dst++ = *src++;
950 }
951 else
952 while (src < src_end)
953 {
954 c = *src++;
955 if (c >= 0x80)
956 {
1889b238 957 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
958 dst += 4;
959 }
960 else
961 *dst++ = c;
962 }
963 return val;
964}
965
966\f
8f924df7 967DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
0168c3d8 968 doc: /*
d2e83296
DL
969Concatenate all the argument characters and make the result a string.
970usage: (string &rest CHARACTERS) */)
0168c3d8
KH
971 (n, args)
972 int n;
973 Lisp_Object *args;
974{
975 int i;
976 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
977 unsigned char *p = buf;
978 int c;
979
980 for (i = 0; i < n; i++)
981 {
982 CHECK_CHARACTER (args[i]);
983 c = XINT (args[i]);
984 p += CHAR_STRING (c, p);
985 }
986
987 return make_string_from_bytes ((char *) buf, n, p - buf);
988}
989
70b4969d 990DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
87d6f965
KH
991 doc: /* Concatenate all the argument bytes and make the result a unibyte string.
992usage: (unibyte-string &rest BYTES) */)
70b4969d
KH
993 (n, args)
994 int n;
995 Lisp_Object *args;
996{
997 int i;
998 unsigned char *buf = (unsigned char *) alloca (n);
999 unsigned char *p = buf;
1000 unsigned c;
1001
1002 for (i = 0; i < n; i++)
1003 {
1004 CHECK_NATNUM (args[i]);
1005 c = XFASTINT (args[i]);
1006 if (c >= 256)
1007 args_out_of_range_3 (args[i], make_number (0), make_number (255));
1008 *p++ = c;
1009 }
1010
1011 return make_string_from_bytes ((char *) buf, n, p - buf);
1012}
1013
c73ae4ae 1014DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
d0363d44
KH
1015 Schar_resolve_modifiers, 1, 1, 0,
1016 doc: /* Resolve modifiers in the character CHAR.
1017The value is a character with modifiers resolved into the character
1018code. Unresolved modifiers are kept in the value.
c73ae4ae 1019usage: (char-resolve-modifiers CHAR) */)
d0363d44
KH
1020 (character)
1021 Lisp_Object character;
1022{
1023 int c;
1024
1025 CHECK_NUMBER (character);
1026 c = XINT (character);
1027 return make_number (char_resolve_modifier_mask (c));
1028}
1029
ee107a89
KH
1030DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
1031 doc: /* Return a byte value of a character at point.
1032Optional 1st arg POSITION, if non-nil, is a position of a character to get
1033a byte value.
1034Optional 2nd arg STRING, if non-nil, is a string of which first
1035character is a target to get a byte value. In this case, POSITION, if
1036non-nil, is an index of a target character in the string.
1037
1038If the current buffer (or STRING) is multibyte, and the target
1039character is not ASCII nor 8-bit character, an error is signalled. */)
1040 (position, string)
1041 Lisp_Object position, string;
1042{
1043 int c;
1044 EMACS_INT pos;
1045 unsigned char *p;
1046
1047 if (NILP (string))
1048 {
1049 if (NILP (position))
1050 {
1051 p = PT_ADDR;
1052 }
1053 else
1054 {
1055 CHECK_NUMBER_COERCE_MARKER (position);
1056 if (XINT (position) < BEGV || XINT (position) >= ZV)
1057 args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
1058 pos = XFASTINT (position);
1059 p = CHAR_POS_ADDR (pos);
1060 }
d5998e03
KH
1061 if (NILP (current_buffer->enable_multibyte_characters))
1062 return make_number (*p);
ee107a89
KH
1063 }
1064 else
1065 {
1066 CHECK_STRING (string);
1067 if (NILP (position))
1068 {
1069 p = SDATA (string);
1070 }
1071 else
1072 {
1073 CHECK_NATNUM (position);
1074 if (XINT (position) >= SCHARS (string))
1075 args_out_of_range (string, position);
1076 pos = XFASTINT (position);
1077 p = SDATA (string) + string_char_to_byte (string, pos);
1078 }
d5998e03
KH
1079 if (! STRING_MULTIBYTE (string))
1080 return make_number (*p);
ee107a89
KH
1081 }
1082 c = STRING_CHAR (p, 0);
1083 if (CHAR_BYTE8_P (c))
1084 c = CHAR_TO_BYTE8 (c);
1085 else if (! ASCII_CHAR_P (c))
1086 error ("Not an ASCII nor an 8-bit character: %d", c);
1087 return make_number (c);
1088}
1089
1090
0168c3d8
KH
/* One-time initialization hook for this file.  There is currently
   nothing to do.  */

void
init_character_once ()
{
  return;
}
1095
1096#ifdef emacs
1097
1098void
1099syms_of_character ()
1100{
1101 DEFSYM (Qcharacterp, "characterp");
1102 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1103
1104 staticpro (&Vchar_unify_table);
1105 Vchar_unify_table = Qnil;
1106
1107 defsubr (&Smax_char);
1108 defsubr (&Scharacterp);
1109 defsubr (&Sunibyte_char_to_multibyte);
1110 defsubr (&Smultibyte_char_to_unibyte);
1111 defsubr (&Schar_bytes);
1112 defsubr (&Schar_width);
1113 defsubr (&Sstring_width);
1114 defsubr (&Schar_direction);
0168c3d8 1115 defsubr (&Sstring);
70b4969d 1116 defsubr (&Sunibyte_string);
d0363d44 1117 defsubr (&Schar_resolve_modifiers);
ee107a89 1118 defsubr (&Sget_byte);
0168c3d8
KH
1119
1120 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
1121 doc: /*
68978cf0
DL
1122Vector recording all translation tables ever defined.
1123Each element is a pair (SYMBOL . TABLE) relating the table to the
1124symbol naming it. The ID of a translation table is an index into this vector. */);
0168c3d8
KH
1125 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1126
1127 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1128 doc: /*
1129A char-table for characters which invoke auto-filling.
1130Such characters have value t in this table. */);
1131 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
6cc0e1ca
DL
1132 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1133 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
0168c3d8
KH
1134
1135 DEFVAR_LISP ("char-width-table", &Vchar_width_table,
1136 doc: /*
1137A char-table for width (columns) of each character. */);
1138 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
be8b50bc
KH
1139 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1140 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1141 make_number (4));
0168c3d8
KH
1142
1143 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
1144 doc: /* A char-table for direction of each character. */);
1145 Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
1146
1147 DEFVAR_LISP ("printable-chars", &Vprintable_chars,
1148 doc: /* A char-table for each printable character. */);
db6d4189 1149 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
67dde660
KH
1150 Fset_char_table_range (Vprintable_chars,
1151 Fcons (make_number (32), make_number (126)), Qt);
1152 Fset_char_table_range (Vprintable_chars,
1153 Fcons (make_number (160),
1154 make_number (MAX_5_BYTE_CHAR)), Qt);
15843e6f 1155
c57f3328
KH
1156 DEFVAR_LISP ("char-script-table", &Vchar_script_table,
1157 doc: /* Char table of script symbols.
1158It has one extra slot whose value is a list of script symbols. */);
1159
1160 /* Intern this now in case it isn't already done.
1161 Setting this variable twice is harmless.
1162 But don't staticpro it here--that is done in alloc.c. */
1163 Qchar_table_extra_slots = intern ("char-table-extra-slots");
1164 DEFSYM (Qchar_script_table, "char-script-table");
1165 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1166 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
c7e14352
KH
1167
1168 DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
c3bb7671
KH
1169 doc: /* Alist of scripts vs the representative characters.
1170Each element is a cons (SCRIPT . CHARS), where SCRIPT is a script name symbol,
1171CHARS is a list or a vector of characters.
472a4dc9 1172If it is a list, all characters in the list are necessary for supporting SCRIPT.
c3bb7671
KH
1173If it is a vector, one of the characters in the vector is necessary.
1174This variable is used to find a font for a specific script. */);
c7e14352 1175 Vscript_representative_chars = Qnil;
a3cbb631
KH
1176
1177 DEFVAR_LISP ("unicode-category-table", &Vunicode_category_table,
1178 doc: /* Char table of Unicode's "General Category".
472a4dc9
JB
1179All Unicode characters have one of the following values (symbol):
1180 Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
a3cbb631
KH
1181 Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1182See The Unicode Standard for the meaning of those values. */);
1183 /* The correct char-table is setup in characters.el. */
1184 Vunicode_category_table = Qnil;
0168c3d8
KH
1185}
1186
1187#endif /* emacs */
fbaf0946
MB
1188
1189/* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1190 (do not change this comment) */