* coding.c (MAX_CHARBUF_SIZE): Renamed from CHARBUF_SIZE.
[bpt/emacs.git] / src / character.c
CommitLineData
0168c3d8 1/* Basic character support.
73b0cd50 2
ba318903 3Copyright (C) 2001-2014 Free Software Foundation, Inc.
73b0cd50
GM
4Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
5 Licensed to the Free Software Foundation.
6Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
7 National Institute of Advanced Industrial Science and Technology (AIST)
8 Registration Number H13PRO009
0168c3d8
KH
9
10This file is part of GNU Emacs.
11
9ec0b715 12GNU Emacs is free software: you can redistribute it and/or modify
0168c3d8 13it under the terms of the GNU General Public License as published by
9ec0b715
GM
14the Free Software Foundation, either version 3 of the License, or
15(at your option) any later version.
0168c3d8
KH
16
17GNU Emacs is distributed in the hope that it will be useful,
18but WITHOUT ANY WARRANTY; without even the implied warranty of
19MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20GNU General Public License for more details.
21
22You should have received a copy of the GNU General Public License
9ec0b715 23along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
0168c3d8
KH
24
25/* At first, see the document in `character.h' to understand the code
26 in this file. */
27
28#ifdef emacs
29#include <config.h>
30#endif
31
32#include <stdio.h>
33
34#ifdef emacs
35
36#include <sys/types.h>
2b4560a8 37#include <intprops.h>
0168c3d8
KH
38#include "lisp.h"
39#include "character.h"
40#include "buffer.h"
41#include "charset.h"
42#include "composite.h"
43#include "disptab.h"
44
45#else /* not emacs */
46
47#include "mulelib.h"
48
49#endif /* emacs */
50
51Lisp_Object Qcharacterp;
52
955cbe7b 53static Lisp_Object Qauto_fill_chars;
0168c3d8 54
33f91981 55/* Char-table of information about which character to unify to which
6f1b43a0 56 Unicode character. Mainly used by the macro MAYBE_UNIFY_CHAR. */
0168c3d8
KH
57Lisp_Object Vchar_unify_table;
58
c57f3328
KH
59static Lisp_Object Qchar_script_table;
60
0168c3d8
KH
61\f
62
2bde7652
KH
63/* If character code C has modifier masks, reflect them to the
64 character code if possible. Return the resulting code. */
65
d311d28c
PE
66EMACS_INT
67char_resolve_modifier_mask (EMACS_INT c)
2bde7652 68{
d0363d44 69 /* A non-ASCII character can't reflect modifier bits to the code. */
2bde7652
KH
70 if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
71 return c;
72
73 /* For Meta, Shift, and Control modifiers, we need special care. */
2bde7652
KH
74 if (c & CHAR_SHIFT)
75 {
76 /* Shift modifier is valid only with [A-Za-z]. */
77 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
78 c &= ~CHAR_SHIFT;
79 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
80 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
03365d0e
KH
81 /* Shift modifier for control characters and SPC is ignored. */
82 else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
d0363d44
KH
83 c &= ~CHAR_SHIFT;
84 }
2bde7652
KH
85 if (c & CHAR_CTL)
86 {
87 /* Simulate the code in lread.c. */
88 /* Allow `\C- ' and `\C-?'. */
03365d0e
KH
89 if ((c & 0377) == ' ')
90 c &= ~0177 & ~ CHAR_CTL;
91 else if ((c & 0377) == '?')
92 c = 0177 | (c & ~0177 & ~CHAR_CTL);
2bde7652
KH
93 /* ASCII control chars are made from letters (both cases),
94 as well as the non-letters within 0100...0137. */
95 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
96 c &= (037 | (~0177 & ~CHAR_CTL));
97 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
98 c &= (037 | (~0177 & ~CHAR_CTL));
99 }
e9c1637d 100#if 0 /* This is outside the scope of this function. (bug#4751) */
03365d0e
KH
101 if (c & CHAR_META)
102 {
103 /* Move the meta bit to the right place for a string. */
104 c = (c & ~CHAR_META) | 0x80;
105 }
e9c1637d 106#endif
2bde7652
KH
107
108 return c;
109}
110
111
33f91981
KH
112/* Store multibyte form of character C at P. If C has modifier bits,
113 handle them appropriately. */
114
0168c3d8 115int
971de7fb 116char_string (unsigned int c, unsigned char *p)
0168c3d8
KH
117{
118 int bytes;
119
e3d8eb8c
KH
120 if (c & CHAR_MODIFIER_MASK)
121 {
c5958d4c 122 c = char_resolve_modifier_mask (c);
e3d8eb8c
KH
123 /* If C still has any modifier bits, just ignore it. */
124 c &= ~CHAR_MODIFIER_MASK;
125 }
126
e3d8eb8c 127 if (c <= MAX_3_BYTE_CHAR)
0168c3d8
KH
128 {
129 bytes = CHAR_STRING (c, p);
130 }
131 else if (c <= MAX_4_BYTE_CHAR)
132 {
133 p[0] = (0xF0 | (c >> 18));
134 p[1] = (0x80 | ((c >> 12) & 0x3F));
135 p[2] = (0x80 | ((c >> 6) & 0x3F));
136 p[3] = (0x80 | (c & 0x3F));
137 bytes = 4;
138 }
e3d8eb8c 139 else if (c <= MAX_5_BYTE_CHAR)
0168c3d8
KH
140 {
141 p[0] = 0xF8;
142 p[1] = (0x80 | ((c >> 18) & 0x0F));
143 p[2] = (0x80 | ((c >> 12) & 0x3F));
144 p[3] = (0x80 | ((c >> 6) & 0x3F));
145 p[4] = (0x80 | (c & 0x3F));
146 bytes = 5;
147 }
5aa91c9b 148 else if (c <= MAX_CHAR)
e3d8eb8c
KH
149 {
150 c = CHAR_TO_BYTE8 (c);
151 bytes = BYTE8_STRING (c, p);
152 }
5aa91c9b 153 else
e6c3da20 154 error ("Invalid character: %x", c);
1889b238 155
0168c3d8
KH
156 return bytes;
157}
158
159
224a3131 160/* Return a character whose multibyte form is at P. If LEN is not
33f91981 161 NULL, it must be a pointer to integer. In that case, set *LEN to
224a3131 162 the byte length of the multibyte form. If ADVANCED is not NULL, it
33f91981 163 must be a pointer to unsigned char. In that case, set *ADVANCED to
224a3131 164 the ending address (i.e., the starting address of the next
33f91981
KH
165 character) of the multibyte form. */
166
0168c3d8 167int
971de7fb 168string_char (const unsigned char *p, const unsigned char **advanced, int *len)
0168c3d8 169{
1889b238 170 int c;
15843e6f 171 const unsigned char *saved_p = p;
0168c3d8
KH
172
173 if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
174 {
3f246b65 175 /* 1-, 2-, and 3-byte sequences can be handled by the macro. */
0168c3d8
KH
176 c = STRING_CHAR_ADVANCE (p);
177 }
178 else if (! (*p & 0x08))
179 {
3f246b65
KH
180 /* A 4-byte sequence of this form:
181 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
182 c = ((((p)[0] & 0x7) << 18)
0168c3d8
KH
183 | (((p)[1] & 0x3F) << 12)
184 | (((p)[2] & 0x3F) << 6)
185 | ((p)[3] & 0x3F));
186 p += 4;
187 }
188 else
189 {
3f246b65
KH
190 /* A 5-byte sequence of this form:
191
192 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
193
194 Note that the top 4 `x's are always 0, so shifting p[1] can
195 never exceed the maximum valid character codepoint. */
196 c = (/* (((p)[0] & 0x3) << 24) ... always 0, so no need to shift. */
197 (((p)[1] & 0x3F) << 18)
0168c3d8
KH
198 | (((p)[2] & 0x3F) << 12)
199 | (((p)[3] & 0x3F) << 6)
200 | ((p)[4] & 0x3F));
201 p += 5;
202 }
203
0168c3d8
KH
204 if (len)
205 *len = p - saved_p;
206 if (advanced)
207 *advanced = p;
208 return c;
209}
210
211
224a3131
EZ
212/* Translate character C by translation table TABLE. If no translation is
213 found in TABLE, return the untranslated character. If TABLE is a list,
214 elements are char tables. In that case, recursively translate C by all the
215 tables in the list. */
0168c3d8
KH
216
217int
971de7fb 218translate_char (Lisp_Object table, int c)
0168c3d8 219{
10453be9
KH
220 if (CHAR_TABLE_P (table))
221 {
222 Lisp_Object ch;
223
224 ch = CHAR_TABLE_REF (table, c);
225 if (CHARACTERP (ch))
226 c = XINT (ch);
227 }
228 else
229 {
230 for (; CONSP (table); table = XCDR (table))
231 c = translate_char (XCAR (table), c);
232 }
233 return c;
0168c3d8
KH
234}
235
0168c3d8 236DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
bc985c87 237 doc: /* Return non-nil if OBJECT is a character.
4abcdac8
CY
238In Emacs Lisp, characters are represented by character codes, which
239are non-negative integers. The function `max-char' returns the
240maximum character code.
bc985c87 241usage: (characterp OBJECT) */)
5842a27b 242 (Lisp_Object object, Lisp_Object ignore)
0168c3d8
KH
243{
244 return (CHARACTERP (object) ? Qt : Qnil);
245}
246
247DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
248 doc: /* Return the character of the maximum code. */)
5842a27b 249 (void)
0168c3d8
KH
250{
251 return make_number (MAX_CHAR);
252}
253
254DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
255 Sunibyte_char_to_multibyte, 1, 1, 0,
5556875b 256 doc: /* Convert the byte CH to multibyte character. */)
5842a27b 257 (Lisp_Object ch)
0168c3d8
KH
258{
259 int c;
0168c3d8
KH
260
261 CHECK_CHARACTER (ch);
262 c = XFASTINT (ch);
2e5db15c
KH
263 if (c >= 0x100)
264 error ("Not a unibyte character: %d", c);
4c0354d7 265 MAKE_CHAR_MULTIBYTE (c);
0168c3d8
KH
266 return make_number (c);
267}
268
269DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
270 Smultibyte_char_to_unibyte, 1, 1, 0,
5556875b
SM
271 doc: /* Convert the multibyte character CH to a byte.
272If the multibyte character does not represent a byte, return -1. */)
5842a27b 273 (Lisp_Object ch)
0168c3d8 274{
5556875b 275 int cm;
0168c3d8
KH
276
277 CHECK_CHARACTER (ch);
5556875b
SM
278 cm = XFASTINT (ch);
279 if (cm < 256)
280 /* Can't distinguish a byte read from a unibyte buffer from
281 a latin1 char, so let's let it slide. */
282 return ch;
283 else
284 {
2afc21f5 285 int cu = CHAR_TO_BYTE_SAFE (cm);
5556875b
SM
286 return make_number (cu);
287 }
0168c3d8
KH
288}
289
25ed9e61
KH
290
291/* Return width (columns) of C considering the buffer display table DP. */
292
6e6c82a4 293static ptrdiff_t
25ed9e61
KH
294char_width (int c, struct Lisp_Char_Table *dp)
295{
6e6c82a4 296 ptrdiff_t width = CHAR_WIDTH (c);
25ed9e61
KH
297
298 if (dp)
299 {
300 Lisp_Object disp = DISP_CHAR_VECTOR (dp, c), ch;
301 int i;
302
303 if (VECTORP (disp))
304 for (i = 0, width = 0; i < ASIZE (disp); i++)
305 {
306 ch = AREF (disp, i);
307 if (CHARACTERP (ch))
df0b2940
PE
308 {
309 int w = CHAR_WIDTH (XFASTINT (ch));
310 if (INT_ADD_OVERFLOW (width, w))
311 string_overflow ();
312 width += w;
313 }
25ed9e61
KH
314 }
315 }
316 return width;
317}
318
319
a7ca3326 320DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
0168c3d8
KH
321 doc: /* Return width of CHAR when displayed in the current buffer.
322The width is measured by how many columns it occupies on the screen.
add553ac
JB
323Tab is taken to occupy `tab-width' columns.
324usage: (char-width CHAR) */)
5842a27b 325 (Lisp_Object ch)
0168c3d8 326{
df0b2940 327 int c;
6e6c82a4 328 ptrdiff_t width;
0168c3d8
KH
329
330 CHECK_CHARACTER (ch);
331 c = XINT (ch);
25ed9e61 332 width = char_width (c, buffer_display_table ());
0168c3d8
KH
333 return make_number (width);
334}
335
0168c3d8
KH
336/* Return width of string STR of length LEN when displayed in the
337 current buffer. The width is measured by how many columns it
338 occupies on the screen. If PRECISION > 0, return the width of
339 longest substring that doesn't exceed PRECISION, and set number of
340 characters and bytes of the substring in *NCHARS and *NBYTES
341 respectively. */
342
d311d28c
PE
343ptrdiff_t
344c_string_width (const unsigned char *str, ptrdiff_t len, int precision,
345 ptrdiff_t *nchars, ptrdiff_t *nbytes)
0168c3d8 346{
d311d28c
PE
347 ptrdiff_t i = 0, i_byte = 0;
348 ptrdiff_t width = 0;
0168c3d8
KH
349 struct Lisp_Char_Table *dp = buffer_display_table ();
350
351 while (i_byte < len)
352 {
25ed9e61 353 int bytes;
62a6e103 354 int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
6e6c82a4 355 ptrdiff_t thiswidth = char_width (c, dp);
0168c3d8 356
df0b2940 357 if (precision <= 0)
0168c3d8 358 {
df0b2940
PE
359 if (INT_ADD_OVERFLOW (width, thiswidth))
360 string_overflow ();
0168c3d8 361 }
df0b2940 362 else if (precision - width < thiswidth)
0168c3d8
KH
363 {
364 *nchars = i;
365 *nbytes = i_byte;
366 return width;
367 }
368 i++;
369 i_byte += bytes;
370 width += thiswidth;
371 }
372
373 if (precision > 0)
374 {
375 *nchars = i;
376 *nbytes = i_byte;
377 }
378
379 return width;
380}
381
1889b238
KH
382/* Return width of string STR of length LEN when displayed in the
383 current buffer. The width is measured by how many columns it
384 occupies on the screen. */
385
d311d28c
PE
386ptrdiff_t
387strwidth (const char *str, ptrdiff_t len)
1889b238 388{
7469ef5d 389 return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
1889b238
KH
390}
391
0168c3d8
KH
392/* Return width of Lisp string STRING when displayed in the current
393 buffer. The width is measured by how many columns it occupies on
394 the screen while paying attention to compositions. If PRECISION >
395 0, return the width of longest substring that doesn't exceed
396 PRECISION, and set number of characters and bytes of the substring
397 in *NCHARS and *NBYTES respectively. */
398
d311d28c
PE
399ptrdiff_t
400lisp_string_width (Lisp_Object string, ptrdiff_t precision,
401 ptrdiff_t *nchars, ptrdiff_t *nbytes)
0168c3d8 402{
d311d28c 403 ptrdiff_t len = SCHARS (string);
0aee65b9
KH
404 /* This set multibyte to 0 even if STRING is multibyte when it
405 contains only ascii and eight-bit-graphic, but that's
406 intentional. */
d5172d4f 407 bool multibyte = len < SBYTES (string);
8f924df7 408 unsigned char *str = SDATA (string);
d311d28c
PE
409 ptrdiff_t i = 0, i_byte = 0;
410 ptrdiff_t width = 0;
0168c3d8
KH
411 struct Lisp_Char_Table *dp = buffer_display_table ();
412
413 while (i < len)
414 {
d311d28c 415 ptrdiff_t chars, bytes, thiswidth;
0168c3d8 416 Lisp_Object val;
ebfa62c0 417 ptrdiff_t cmp_id;
d311d28c 418 ptrdiff_t ignore, end;
0168c3d8
KH
419
420 if (find_composition (i, -1, &ignore, &end, &val, string)
421 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
422 >= 0))
423 {
424 thiswidth = composition_table[cmp_id]->width;
425 chars = end - i;
426 bytes = string_char_to_byte (string, end) - i_byte;
427 }
0168c3d8
KH
428 else
429 {
0aee65b9 430 int c;
0168c3d8 431
0aee65b9 432 if (multibyte)
2b4560a8
PE
433 {
434 int cbytes;
435 c = STRING_CHAR_AND_LENGTH (str + i_byte, cbytes);
436 bytes = cbytes;
437 }
0aee65b9
KH
438 else
439 c = str[i_byte], bytes = 1;
0168c3d8 440 chars = 1;
25ed9e61 441 thiswidth = char_width (c, dp);
0168c3d8
KH
442 }
443
2b4560a8
PE
444 if (precision <= 0)
445 {
446#ifdef emacs
447 if (INT_ADD_OVERFLOW (width, thiswidth))
448 string_overflow ();
449#endif
450 }
451 else if (precision - width < thiswidth)
0168c3d8
KH
452 {
453 *nchars = i;
454 *nbytes = i_byte;
455 return width;
456 }
457 i += chars;
458 i_byte += bytes;
459 width += thiswidth;
ef3ff036 460 }
0168c3d8
KH
461
462 if (precision > 0)
463 {
464 *nchars = i;
465 *nbytes = i_byte;
466 }
467
468 return width;
469}
470
471DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
472 doc: /* Return width of STRING when displayed in the current buffer.
473Width is measured by how many columns it occupies on the screen.
474When calculating width of a multibyte character in STRING,
475only the base leading-code is considered; the validity of
476the following bytes is not checked. Tabs in STRING are always
add553ac
JB
477taken to occupy `tab-width' columns.
478usage: (string-width STRING) */)
5842a27b 479 (Lisp_Object str)
0168c3d8
KH
480{
481 Lisp_Object val;
482
483 CHECK_STRING (str);
484 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
485 return val;
486}
487
0168c3d8
KH
488/* Return the number of characters in the NBYTES bytes at PTR.
489 This works by looking at the contents and checking for multibyte
490 sequences while assuming that there's no invalid sequence.
491 However, if the current buffer has enable-multibyte-characters =
492 nil, we treat each byte as a character. */
493
d311d28c
PE
494ptrdiff_t
495chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
0168c3d8
KH
496{
497 /* current_buffer is null at early stages of Emacs initialization. */
498 if (current_buffer == 0
4b4deea2 499 || NILP (BVAR (current_buffer, enable_multibyte_characters)))
0168c3d8
KH
500 return nbytes;
501
502 return multibyte_chars_in_text (ptr, nbytes);
503}
504
505/* Return the number of characters in the NBYTES bytes at PTR.
506 This works by looking at the contents and checking for multibyte
507 sequences while assuming that there's no invalid sequence. It
508 ignores enable-multibyte-characters. */
509
d311d28c
PE
510ptrdiff_t
511multibyte_chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
0168c3d8 512{
8f924df7 513 const unsigned char *endp = ptr + nbytes;
d311d28c 514 ptrdiff_t chars = 0;
0168c3d8
KH
515
516 while (ptr < endp)
517 {
d311d28c 518 int len = MULTIBYTE_LENGTH (ptr, endp);
0168c3d8
KH
519
520 if (len == 0)
1088b922 521 emacs_abort ();
0168c3d8
KH
522 ptr += len;
523 chars++;
524 }
525
526 return chars;
527}
528
529/* Parse unibyte text at STR of LEN bytes as a multibyte text, count
530 characters and bytes in it, and store them in *NCHARS and *NBYTES
531 respectively. On counting bytes, pay attention to that 8-bit
532 characters not constructing a valid multibyte sequence are
533 represented by 2-byte in a multibyte text. */
534
535void
d311d28c
PE
536parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
537 ptrdiff_t *nchars, ptrdiff_t *nbytes)
0168c3d8 538{
8f924df7 539 const unsigned char *endp = str + len;
d311d28c
PE
540 int n;
541 ptrdiff_t chars = 0, bytes = 0;
0168c3d8
KH
542
543 if (len >= MAX_MULTIBYTE_LENGTH)
544 {
8f924df7 545 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
0168c3d8
KH
546 while (str < adjusted_endp)
547 {
fc9a17bc
KH
548 if (! CHAR_BYTE8_HEAD_P (*str)
549 && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
0168c3d8
KH
550 str += n, bytes += n;
551 else
552 str++, bytes += 2;
553 chars++;
554 }
555 }
556 while (str < endp)
557 {
fc9a17bc
KH
558 if (! CHAR_BYTE8_HEAD_P (*str)
559 && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
0168c3d8
KH
560 str += n, bytes += n;
561 else
562 str++, bytes += 2;
563 chars++;
564 }
565
566 *nchars = chars;
567 *nbytes = bytes;
568 return;
569}
570
571/* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
da6062e6 572 It actually converts only such 8-bit characters that don't construct
0168c3d8
KH
573 a multibyte sequence to multibyte forms of Latin-1 characters. If
574 NCHARS is nonzero, set *NCHARS to the number of characters in the
575 text. It is assured that we can use LEN bytes at STR as a work
576 area and that is enough. Return the number of bytes of the
577 resulting text. */
578
d311d28c
PE
579ptrdiff_t
580str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
581 ptrdiff_t *nchars)
0168c3d8
KH
582{
583 unsigned char *p = str, *endp = str + nbytes;
584 unsigned char *to;
d311d28c 585 ptrdiff_t chars = 0;
0168c3d8
KH
586 int n;
587
588 if (nbytes >= MAX_MULTIBYTE_LENGTH)
589 {
590 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
591 while (p < adjusted_endp
fc9a17bc 592 && ! CHAR_BYTE8_HEAD_P (*p)
0168c3d8
KH
593 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
594 p += n, chars++;
595 }
fc9a17bc
KH
596 while (p < endp
597 && ! CHAR_BYTE8_HEAD_P (*p)
598 && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
0168c3d8
KH
599 p += n, chars++;
600 if (nchars)
601 *nchars = chars;
602 if (p == endp)
603 return nbytes;
604
605 to = p;
606 nbytes = endp - p;
607 endp = str + len;
72af86bd 608 memmove (endp - nbytes, p, nbytes);
0168c3d8
KH
609 p = endp - nbytes;
610
611 if (nbytes >= MAX_MULTIBYTE_LENGTH)
612 {
613 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
614 while (p < adjusted_endp)
615 {
fc9a17bc
KH
616 if (! CHAR_BYTE8_HEAD_P (*p)
617 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
0168c3d8
KH
618 {
619 while (n--)
620 *to++ = *p++;
621 }
622 else
623 {
624 int c = *p++;
625 c = BYTE8_TO_CHAR (c);
626 to += CHAR_STRING (c, to);
627 }
628 }
629 chars++;
630 }
631 while (p < endp)
632 {
fc9a17bc
KH
633 if (! CHAR_BYTE8_HEAD_P (*p)
634 && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
0168c3d8
KH
635 {
636 while (n--)
637 *to++ = *p++;
8f924df7 638 }
0168c3d8
KH
639 else
640 {
641 int c = *p++;
642 c = BYTE8_TO_CHAR (c);
643 to += CHAR_STRING (c, to);
644 }
645 chars++;
646 }
647 if (nchars)
648 *nchars = chars;
649 return (to - str);
650}
651
652/* Parse unibyte string at STR of LEN bytes, and return the number of
e1dbe924 653 bytes it may occupy when converted to multibyte string by
0168c3d8
KH
654 `str_to_multibyte'. */
655
d311d28c
PE
656ptrdiff_t
657count_size_as_multibyte (const unsigned char *str, ptrdiff_t len)
0168c3d8 658{
eec47d6b 659 const unsigned char *endp = str + len;
d311d28c 660 ptrdiff_t bytes;
0168c3d8
KH
661
662 for (bytes = 0; str < endp; str++)
de883a70
PE
663 {
664 int n = *str < 0x80 ? 1 : 2;
665 if (INT_ADD_OVERFLOW (bytes, n))
666 string_overflow ();
667 bytes += n;
668 }
0168c3d8
KH
669 return bytes;
670}
671
672
ef3ff036 673/* Convert unibyte text at STR of BYTES bytes to a multibyte text
0168c3d8
KH
674 that contains the same single-byte characters. It actually
675 converts all 8-bit characters to multibyte forms. It is assured
676 that we can use LEN bytes at STR as a work area and that is
677 enough. */
678
d311d28c
PE
679ptrdiff_t
680str_to_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t bytes)
0168c3d8
KH
681{
682 unsigned char *p = str, *endp = str + bytes;
683 unsigned char *to;
684
685 while (p < endp && *p < 0x80) p++;
686 if (p == endp)
687 return bytes;
688 to = p;
689 bytes = endp - p;
690 endp = str + len;
72af86bd 691 memmove (endp - bytes, p, bytes);
0168c3d8 692 p = endp - bytes;
8f924df7 693 while (p < endp)
0168c3d8
KH
694 {
695 int c = *p++;
696
697 if (c >= 0x80)
698 c = BYTE8_TO_CHAR (c);
699 to += CHAR_STRING (c, to);
700 }
701 return (to - str);
702}
703
704/* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
705 actually converts characters in the range 0x80..0xFF to
706 unibyte. */
707
d311d28c
PE
708ptrdiff_t
709str_as_unibyte (unsigned char *str, ptrdiff_t bytes)
0168c3d8 710{
15843e6f
KH
711 const unsigned char *p = str, *endp = str + bytes;
712 unsigned char *to;
0168c3d8
KH
713 int c, len;
714
715 while (p < endp)
716 {
717 c = *p;
718 len = BYTES_BY_CHAR_HEAD (c);
719 if (CHAR_BYTE8_HEAD_P (c))
720 break;
721 p += len;
722 }
15843e6f 723 to = str + (p - str);
8f924df7 724 while (p < endp)
0168c3d8
KH
725 {
726 c = *p;
727 len = BYTES_BY_CHAR_HEAD (c);
728 if (CHAR_BYTE8_HEAD_P (c))
729 {
730 c = STRING_CHAR_ADVANCE (p);
731 *to++ = CHAR_TO_BYTE8 (c);
732 }
733 else
734 {
735 while (len--) *to++ = *p++;
736 }
737 }
738 return (to - str);
739}
740
4aa40bb8
KH
741/* Convert eight-bit chars in SRC (in multibyte form) to the
742 corresponding byte and store in DST. CHARS is the number of
743 characters in SRC. The value is the number of bytes stored in DST.
744 Usually, the value is the same as CHARS, but is less than it if SRC
d5172d4f 745 contains a non-ASCII, non-eight-bit character. */
4aa40bb8 746
d311d28c 747ptrdiff_t
d5172d4f 748str_to_unibyte (const unsigned char *src, unsigned char *dst, ptrdiff_t chars)
4aa40bb8 749{
d311d28c 750 ptrdiff_t i;
4aa40bb8
KH
751
752 for (i = 0; i < chars; i++)
753 {
754 int c = STRING_CHAR_ADVANCE (src);
755
756 if (CHAR_BYTE8_P (c))
757 c = CHAR_TO_BYTE8 (c);
d5172d4f 758 else if (! ASCII_CHAR_P (c))
4aa40bb8
KH
759 return i;
760 *dst++ = c;
761 }
762 return i;
763}
764
765
d311d28c 766static ptrdiff_t
971de7fb 767string_count_byte8 (Lisp_Object string)
0168c3d8 768{
d5172d4f 769 bool multibyte = STRING_MULTIBYTE (string);
d311d28c 770 ptrdiff_t nbytes = SBYTES (string);
8f924df7 771 unsigned char *p = SDATA (string);
0168c3d8 772 unsigned char *pend = p + nbytes;
d311d28c 773 ptrdiff_t count = 0;
0168c3d8
KH
774 int c, len;
775
776 if (multibyte)
777 while (p < pend)
778 {
779 c = *p;
780 len = BYTES_BY_CHAR_HEAD (c);
781
782 if (CHAR_BYTE8_HEAD_P (c))
783 count++;
784 p += len;
785 }
786 else
787 while (p < pend)
788 {
789 if (*p++ >= 0x80)
790 count++;
791 }
792 return count;
793}
794
795
796Lisp_Object
971de7fb 797string_escape_byte8 (Lisp_Object string)
0168c3d8 798{
d311d28c
PE
799 ptrdiff_t nchars = SCHARS (string);
800 ptrdiff_t nbytes = SBYTES (string);
d5172d4f 801 bool multibyte = STRING_MULTIBYTE (string);
d311d28c 802 ptrdiff_t byte8_count;
15843e6f
KH
803 const unsigned char *src, *src_end;
804 unsigned char *dst;
0168c3d8
KH
805 Lisp_Object val;
806 int c, len;
807
808 if (multibyte && nchars == nbytes)
809 return string;
810
811 byte8_count = string_count_byte8 (string);
812
813 if (byte8_count == 0)
814 return string;
815
816 if (multibyte)
14162469
EZ
817 {
818 if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
c9d624c6 819 || (STRING_BYTES_BOUND - nbytes) / 2 < byte8_count)
cb93f9be 820 string_overflow ();
14162469
EZ
821
822 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
823 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
824 nbytes + byte8_count * 2);
825 }
0168c3d8 826 else
14162469 827 {
c9d624c6 828 if ((STRING_BYTES_BOUND - nbytes) / 3 < byte8_count)
cb93f9be 829 string_overflow ();
ef3ff036 830
14162469
EZ
831 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
832 val = make_uninit_string (nbytes + byte8_count * 3);
833 }
0168c3d8 834
8f924df7 835 src = SDATA (string);
0168c3d8 836 src_end = src + nbytes;
8f924df7 837 dst = SDATA (val);
0168c3d8
KH
838 if (multibyte)
839 while (src < src_end)
840 {
841 c = *src;
842 len = BYTES_BY_CHAR_HEAD (c);
843
844 if (CHAR_BYTE8_HEAD_P (c))
845 {
846 c = STRING_CHAR_ADVANCE (src);
847 c = CHAR_TO_BYTE8 (c);
99027bdd 848 dst += sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
849 }
850 else
851 while (len--) *dst++ = *src++;
852 }
853 else
854 while (src < src_end)
855 {
856 c = *src++;
857 if (c >= 0x80)
99027bdd 858 dst += sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
859 else
860 *dst++ = c;
861 }
862 return val;
863}
864
865\f
a7ca3326 866DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
0168c3d8 867 doc: /*
d2e83296
DL
868Concatenate all the argument characters and make the result a string.
869usage: (string &rest CHARACTERS) */)
f66c7cf8 870 (ptrdiff_t n, Lisp_Object *args)
0168c3d8 871{
f66c7cf8 872 ptrdiff_t i;
c5101a77 873 int c;
754790b6
CY
874 unsigned char *buf, *p;
875 Lisp_Object str;
876 USE_SAFE_ALLOCA;
877
0065d054 878 SAFE_NALLOCA (buf, MAX_MULTIBYTE_LENGTH, n);
754790b6 879 p = buf;
0168c3d8
KH
880
881 for (i = 0; i < n; i++)
882 {
883 CHECK_CHARACTER (args[i]);
884 c = XINT (args[i]);
885 p += CHAR_STRING (c, p);
886 }
887
754790b6
CY
888 str = make_string_from_bytes ((char *) buf, n, p - buf);
889 SAFE_FREE ();
890 return str;
0168c3d8
KH
891}
892
70b4969d 893DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
87d6f965
KH
894 doc: /* Concatenate all the argument bytes and make the result a unibyte string.
895usage: (unibyte-string &rest BYTES) */)
f66c7cf8 896 (ptrdiff_t n, Lisp_Object *args)
70b4969d 897{
f66c7cf8 898 ptrdiff_t i;
754790b6
CY
899 Lisp_Object str;
900 USE_SAFE_ALLOCA;
98c6f1e3
PE
901 unsigned char *buf = SAFE_ALLOCA (n);
902 unsigned char *p = buf;
70b4969d
KH
903
904 for (i = 0; i < n; i++)
905 {
af5a5a98 906 CHECK_RANGED_INTEGER (args[i], 0, 255);
a14e1568 907 *p++ = XINT (args[i]);
70b4969d
KH
908 }
909
754790b6
CY
910 str = make_string_from_bytes ((char *) buf, n, p - buf);
911 SAFE_FREE ();
912 return str;
70b4969d
KH
913}
914
c73ae4ae 915DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
d0363d44
KH
916 Schar_resolve_modifiers, 1, 1, 0,
917 doc: /* Resolve modifiers in the character CHAR.
918The value is a character with modifiers resolved into the character
919code. Unresolved modifiers are kept in the value.
c73ae4ae 920usage: (char-resolve-modifiers CHAR) */)
5842a27b 921 (Lisp_Object character)
d0363d44 922{
d311d28c 923 EMACS_INT c;
d0363d44
KH
924
925 CHECK_NUMBER (character);
926 c = XINT (character);
927 return make_number (char_resolve_modifier_mask (c));
928}
929
ee107a89
KH
930DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
931 doc: /* Return a byte value of a character at point.
932Optional 1st arg POSITION, if non-nil, is a position of a character to get
933a byte value.
934Optional 2nd arg STRING, if non-nil, is a string of which first
935character is a target to get a byte value. In this case, POSITION, if
936non-nil, is an index of a target character in the string.
937
938If the current buffer (or STRING) is multibyte, and the target
8350f087 939character is not ASCII nor 8-bit character, an error is signaled. */)
5842a27b 940 (Lisp_Object position, Lisp_Object string)
ee107a89
KH
941{
942 int c;
d311d28c 943 ptrdiff_t pos;
ee107a89
KH
944 unsigned char *p;
945
946 if (NILP (string))
947 {
948 if (NILP (position))
949 {
950 p = PT_ADDR;
4d8e170e 951 }
ee107a89
KH
952 else
953 {
954 CHECK_NUMBER_COERCE_MARKER (position);
955 if (XINT (position) < BEGV || XINT (position) >= ZV)
956 args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
957 pos = XFASTINT (position);
958 p = CHAR_POS_ADDR (pos);
959 }
4b4deea2 960 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
d5998e03 961 return make_number (*p);
ee107a89
KH
962 }
963 else
964 {
965 CHECK_STRING (string);
966 if (NILP (position))
967 {
968 p = SDATA (string);
969 }
970 else
971 {
972 CHECK_NATNUM (position);
973 if (XINT (position) >= SCHARS (string))
974 args_out_of_range (string, position);
975 pos = XFASTINT (position);
976 p = SDATA (string) + string_char_to_byte (string, pos);
977 }
d5998e03
KH
978 if (! STRING_MULTIBYTE (string))
979 return make_number (*p);
ee107a89 980 }
62a6e103 981 c = STRING_CHAR (p);
ee107a89
KH
982 if (CHAR_BYTE8_P (c))
983 c = CHAR_TO_BYTE8 (c);
984 else if (! ASCII_CHAR_P (c))
985 error ("Not an ASCII nor an 8-bit character: %d", c);
986 return make_number (c);
987}
988
0168c3d8
KH
989#ifdef emacs
990
991void
971de7fb 992syms_of_character (void)
0168c3d8
KH
993{
994 DEFSYM (Qcharacterp, "characterp");
995 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
996
997 staticpro (&Vchar_unify_table);
998 Vchar_unify_table = Qnil;
999
1000 defsubr (&Smax_char);
1001 defsubr (&Scharacterp);
1002 defsubr (&Sunibyte_char_to_multibyte);
1003 defsubr (&Smultibyte_char_to_unibyte);
0168c3d8
KH
1004 defsubr (&Schar_width);
1005 defsubr (&Sstring_width);
0168c3d8 1006 defsubr (&Sstring);
70b4969d 1007 defsubr (&Sunibyte_string);
d0363d44 1008 defsubr (&Schar_resolve_modifiers);
ee107a89 1009 defsubr (&Sget_byte);
0168c3d8 1010
29208e82 1011 DEFVAR_LISP ("translation-table-vector", Vtranslation_table_vector,
0168c3d8 1012 doc: /*
68978cf0
DL
1013Vector recording all translation tables ever defined.
1014Each element is a pair (SYMBOL . TABLE) relating the table to the
1015symbol naming it. The ID of a translation table is an index into this vector. */);
0168c3d8
KH
1016 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1017
29208e82 1018 DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
0168c3d8
KH
1019 doc: /*
1020A char-table for characters which invoke auto-filling.
1021Such characters have value t in this table. */);
1022 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
6cc0e1ca
DL
1023 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1024 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
0168c3d8 1025
29208e82 1026 DEFVAR_LISP ("char-width-table", Vchar_width_table,
0168c3d8
KH
1027 doc: /*
1028A char-table for width (columns) of each character. */);
1029 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
be8b50bc
KH
1030 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1031 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1032 make_number (4));
0168c3d8 1033
29208e82 1034 DEFVAR_LISP ("printable-chars", Vprintable_chars,
0168c3d8 1035 doc: /* A char-table for each printable character. */);
db6d4189 1036 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
67dde660
KH
1037 Fset_char_table_range (Vprintable_chars,
1038 Fcons (make_number (32), make_number (126)), Qt);
1039 Fset_char_table_range (Vprintable_chars,
1040 Fcons (make_number (160),
1041 make_number (MAX_5_BYTE_CHAR)), Qt);
15843e6f 1042
29208e82 1043 DEFVAR_LISP ("char-script-table", Vchar_script_table,
c57f3328
KH
1044 doc: /* Char table of script symbols.
1045It has one extra slot whose value is a list of script symbols. */);
1046
c57f3328
KH
1047 DEFSYM (Qchar_script_table, "char-script-table");
1048 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1049 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
c7e14352 1050
29208e82 1051 DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
c3bb7671 1052 doc: /* Alist of scripts vs the representative characters.
f4427a54 1053Each element is a cons (SCRIPT . CHARS).
4d8e170e 1054SCRIPT is a symbol representing a script or a subgroup of a script.
c3bb7671 1055CHARS is a list or a vector of characters.
472a4dc9 1056If it is a list, all characters in the list are necessary for supporting SCRIPT.
c3bb7671
KH
1057If it is a vector, one of the characters in the vector is necessary.
1058This variable is used to find a font for a specific script. */);
c7e14352 1059 Vscript_representative_chars = Qnil;
a3cbb631 1060
29208e82 1061 DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
a3cbb631 1062 doc: /* Char table of Unicode's "General Category".
472a4dc9
JB
1063All Unicode characters have one of the following values (symbol):
1064 Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
a3cbb631
KH
1065 Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1066See The Unicode Standard for the meaning of those values. */);
1067 /* The correct char-table is setup in characters.el. */
1068 Vunicode_category_table = Qnil;
0168c3d8
KH
1069}
1070
1071#endif /* emacs */