Fixes: debbugs:17865
[bpt/emacs.git] / src / character.c
CommitLineData
0168c3d8 1/* Basic character support.
73b0cd50 2
ba318903 3Copyright (C) 2001-2014 Free Software Foundation, Inc.
73b0cd50
GM
4Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
5 Licensed to the Free Software Foundation.
6Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
7 National Institute of Advanced Industrial Science and Technology (AIST)
8 Registration Number H13PRO009
0168c3d8
KH
9
10This file is part of GNU Emacs.
11
9ec0b715 12GNU Emacs is free software: you can redistribute it and/or modify
0168c3d8 13it under the terms of the GNU General Public License as published by
9ec0b715
GM
14the Free Software Foundation, either version 3 of the License, or
15(at your option) any later version.
0168c3d8
KH
16
17GNU Emacs is distributed in the hope that it will be useful,
18but WITHOUT ANY WARRANTY; without even the implied warranty of
19MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20GNU General Public License for more details.
21
22You should have received a copy of the GNU General Public License
9ec0b715 23along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
0168c3d8
KH
24
25/* At first, see the document in `character.h' to understand the code
26 in this file. */
27
28#ifdef emacs
29#include <config.h>
30#endif
31
32#include <stdio.h>
33
34#ifdef emacs
35
36#include <sys/types.h>
2b4560a8 37#include <intprops.h>
0168c3d8
KH
38#include "lisp.h"
39#include "character.h"
40#include "buffer.h"
41#include "charset.h"
42#include "composite.h"
43#include "disptab.h"
44
45#else /* not emacs */
46
47#include "mulelib.h"
48
49#endif /* emacs */
50
51Lisp_Object Qcharacterp;
52
955cbe7b 53static Lisp_Object Qauto_fill_chars;
0168c3d8 54
33f91981 55/* Char-table of information about which character to unify to which
6f1b43a0 56 Unicode character. Mainly used by the macro MAYBE_UNIFY_CHAR. */
0168c3d8
KH
57Lisp_Object Vchar_unify_table;
58
c57f3328
KH
59static Lisp_Object Qchar_script_table;
60
0168c3d8
KH
61\f
62
2bde7652
KH
63/* If character code C has modifier masks, reflect them to the
64 character code if possible. Return the resulting code. */
65
d311d28c
PE
66EMACS_INT
67char_resolve_modifier_mask (EMACS_INT c)
2bde7652 68{
d0363d44 69 /* A non-ASCII character can't reflect modifier bits to the code. */
2bde7652
KH
70 if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
71 return c;
72
73 /* For Meta, Shift, and Control modifiers, we need special care. */
2bde7652
KH
74 if (c & CHAR_SHIFT)
75 {
76 /* Shift modifier is valid only with [A-Za-z]. */
77 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
78 c &= ~CHAR_SHIFT;
79 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
80 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
03365d0e
KH
81 /* Shift modifier for control characters and SPC is ignored. */
82 else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
d0363d44
KH
83 c &= ~CHAR_SHIFT;
84 }
2bde7652
KH
85 if (c & CHAR_CTL)
86 {
87 /* Simulate the code in lread.c. */
88 /* Allow `\C- ' and `\C-?'. */
03365d0e
KH
89 if ((c & 0377) == ' ')
90 c &= ~0177 & ~ CHAR_CTL;
91 else if ((c & 0377) == '?')
92 c = 0177 | (c & ~0177 & ~CHAR_CTL);
2bde7652
KH
93 /* ASCII control chars are made from letters (both cases),
94 as well as the non-letters within 0100...0137. */
95 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
96 c &= (037 | (~0177 & ~CHAR_CTL));
97 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
98 c &= (037 | (~0177 & ~CHAR_CTL));
99 }
e9c1637d 100#if 0 /* This is outside the scope of this function. (bug#4751) */
03365d0e
KH
101 if (c & CHAR_META)
102 {
103 /* Move the meta bit to the right place for a string. */
104 c = (c & ~CHAR_META) | 0x80;
105 }
e9c1637d 106#endif
2bde7652
KH
107
108 return c;
109}
110
111
33f91981
KH
112/* Store multibyte form of character C at P. If C has modifier bits,
113 handle them appropriately. */
114
0168c3d8 115int
971de7fb 116char_string (unsigned int c, unsigned char *p)
0168c3d8
KH
117{
118 int bytes;
119
e3d8eb8c
KH
120 if (c & CHAR_MODIFIER_MASK)
121 {
c5958d4c 122 c = char_resolve_modifier_mask (c);
e3d8eb8c
KH
123 /* If C still has any modifier bits, just ignore it. */
124 c &= ~CHAR_MODIFIER_MASK;
125 }
126
e3d8eb8c 127 if (c <= MAX_3_BYTE_CHAR)
0168c3d8
KH
128 {
129 bytes = CHAR_STRING (c, p);
130 }
131 else if (c <= MAX_4_BYTE_CHAR)
132 {
133 p[0] = (0xF0 | (c >> 18));
134 p[1] = (0x80 | ((c >> 12) & 0x3F));
135 p[2] = (0x80 | ((c >> 6) & 0x3F));
136 p[3] = (0x80 | (c & 0x3F));
137 bytes = 4;
138 }
e3d8eb8c 139 else if (c <= MAX_5_BYTE_CHAR)
0168c3d8
KH
140 {
141 p[0] = 0xF8;
142 p[1] = (0x80 | ((c >> 18) & 0x0F));
143 p[2] = (0x80 | ((c >> 12) & 0x3F));
144 p[3] = (0x80 | ((c >> 6) & 0x3F));
145 p[4] = (0x80 | (c & 0x3F));
146 bytes = 5;
147 }
5aa91c9b 148 else if (c <= MAX_CHAR)
e3d8eb8c
KH
149 {
150 c = CHAR_TO_BYTE8 (c);
151 bytes = BYTE8_STRING (c, p);
152 }
5aa91c9b 153 else
e6c3da20 154 error ("Invalid character: %x", c);
1889b238 155
0168c3d8
KH
156 return bytes;
157}
158
159
224a3131 160/* Return a character whose multibyte form is at P. If LEN is not
33f91981 161 NULL, it must be a pointer to integer. In that case, set *LEN to
224a3131 162 the byte length of the multibyte form. If ADVANCED is not NULL, it
33f91981 163 must be a pointer to unsigned char. In that case, set *ADVANCED to
224a3131 164 the ending address (i.e., the starting address of the next
33f91981
KH
165 character) of the multibyte form. */
166
0168c3d8 167int
971de7fb 168string_char (const unsigned char *p, const unsigned char **advanced, int *len)
0168c3d8 169{
1889b238 170 int c;
15843e6f 171 const unsigned char *saved_p = p;
0168c3d8
KH
172
173 if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
174 {
3f246b65 175 /* 1-, 2-, and 3-byte sequences can be handled by the macro. */
0168c3d8
KH
176 c = STRING_CHAR_ADVANCE (p);
177 }
178 else if (! (*p & 0x08))
179 {
3f246b65
KH
180 /* A 4-byte sequence of this form:
181 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
182 c = ((((p)[0] & 0x7) << 18)
0168c3d8
KH
183 | (((p)[1] & 0x3F) << 12)
184 | (((p)[2] & 0x3F) << 6)
185 | ((p)[3] & 0x3F));
186 p += 4;
187 }
188 else
189 {
3f246b65
KH
190 /* A 5-byte sequence of this form:
191
192 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
193
194 Note that the top 4 `x's are always 0, so shifting p[1] can
195 never exceed the maximum valid character codepoint. */
196 c = (/* (((p)[0] & 0x3) << 24) ... always 0, so no need to shift. */
197 (((p)[1] & 0x3F) << 18)
0168c3d8
KH
198 | (((p)[2] & 0x3F) << 12)
199 | (((p)[3] & 0x3F) << 6)
200 | ((p)[4] & 0x3F));
201 p += 5;
202 }
203
0168c3d8
KH
204 if (len)
205 *len = p - saved_p;
206 if (advanced)
207 *advanced = p;
208 return c;
209}
210
211
224a3131
EZ
212/* Translate character C by translation table TABLE. If no translation is
213 found in TABLE, return the untranslated character. If TABLE is a list,
214 elements are char tables. In that case, recursively translate C by all the
215 tables in the list. */
0168c3d8
KH
216
217int
971de7fb 218translate_char (Lisp_Object table, int c)
0168c3d8 219{
10453be9
KH
220 if (CHAR_TABLE_P (table))
221 {
222 Lisp_Object ch;
223
224 ch = CHAR_TABLE_REF (table, c);
225 if (CHARACTERP (ch))
226 c = XINT (ch);
227 }
228 else
229 {
230 for (; CONSP (table); table = XCDR (table))
231 c = translate_char (XCAR (table), c);
232 }
233 return c;
0168c3d8
KH
234}
235
2e5db15c 236/* Convert ASCII or 8-bit character C to unibyte. If C is none of
461c2ab9 237 them, return (C & 0xFF). */
0168c3d8
KH
238
239int
461c2ab9 240multibyte_char_to_unibyte (int c)
0168c3d8 241{
2e5db15c
KH
242 if (c < 0x80)
243 return c;
b672c5ae
KH
244 if (CHAR_BYTE8_P (c))
245 return CHAR_TO_BYTE8 (c);
2e5db15c 246 return (c & 0xFF);
0168c3d8
KH
247}
248
935d5b02
KH
249/* Like multibyte_char_to_unibyte, but return -1 if C is not supported
250 by charset_unibyte. */
251
252int
971de7fb 253multibyte_char_to_unibyte_safe (int c)
935d5b02 254{
2e5db15c
KH
255 if (c < 0x80)
256 return c;
935d5b02
KH
257 if (CHAR_BYTE8_P (c))
258 return CHAR_TO_BYTE8 (c);
2e5db15c 259 return -1;
935d5b02 260}
0168c3d8
KH
261
262DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
bc985c87 263 doc: /* Return non-nil if OBJECT is a character.
4abcdac8
CY
264In Emacs Lisp, characters are represented by character codes, which
265are non-negative integers. The function `max-char' returns the
266maximum character code.
bc985c87 267usage: (characterp OBJECT) */)
5842a27b 268 (Lisp_Object object, Lisp_Object ignore)
0168c3d8
KH
269{
270 return (CHARACTERP (object) ? Qt : Qnil);
271}
272
273DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
274 doc: /* Return the character of the maximum code. */)
5842a27b 275 (void)
0168c3d8
KH
276{
277 return make_number (MAX_CHAR);
278}
279
280DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
281 Sunibyte_char_to_multibyte, 1, 1, 0,
5556875b 282 doc: /* Convert the byte CH to multibyte character. */)
5842a27b 283 (Lisp_Object ch)
0168c3d8
KH
284{
285 int c;
0168c3d8
KH
286
287 CHECK_CHARACTER (ch);
288 c = XFASTINT (ch);
2e5db15c
KH
289 if (c >= 0x100)
290 error ("Not a unibyte character: %d", c);
4c0354d7 291 MAKE_CHAR_MULTIBYTE (c);
0168c3d8
KH
292 return make_number (c);
293}
294
295DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
296 Smultibyte_char_to_unibyte, 1, 1, 0,
5556875b
SM
297 doc: /* Convert the multibyte character CH to a byte.
298If the multibyte character does not represent a byte, return -1. */)
5842a27b 299 (Lisp_Object ch)
0168c3d8 300{
5556875b 301 int cm;
0168c3d8
KH
302
303 CHECK_CHARACTER (ch);
5556875b
SM
304 cm = XFASTINT (ch);
305 if (cm < 256)
306 /* Can't distinguish a byte read from a unibyte buffer from
307 a latin1 char, so let's let it slide. */
308 return ch;
309 else
310 {
2afc21f5 311 int cu = CHAR_TO_BYTE_SAFE (cm);
5556875b
SM
312 return make_number (cu);
313 }
0168c3d8
KH
314}
315
25ed9e61
KH
316
317/* Return width (columns) of C considering the buffer display table DP. */
318
6e6c82a4 319static ptrdiff_t
25ed9e61
KH
320char_width (int c, struct Lisp_Char_Table *dp)
321{
6e6c82a4 322 ptrdiff_t width = CHAR_WIDTH (c);
25ed9e61
KH
323
324 if (dp)
325 {
326 Lisp_Object disp = DISP_CHAR_VECTOR (dp, c), ch;
327 int i;
328
329 if (VECTORP (disp))
330 for (i = 0, width = 0; i < ASIZE (disp); i++)
331 {
332 ch = AREF (disp, i);
333 if (CHARACTERP (ch))
df0b2940
PE
334 {
335 int w = CHAR_WIDTH (XFASTINT (ch));
336 if (INT_ADD_OVERFLOW (width, w))
337 string_overflow ();
338 width += w;
339 }
25ed9e61
KH
340 }
341 }
342 return width;
343}
344
345
a7ca3326 346DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
0168c3d8
KH
347 doc: /* Return width of CHAR when displayed in the current buffer.
348The width is measured by how many columns it occupies on the screen.
add553ac
JB
349Tab is taken to occupy `tab-width' columns.
350usage: (char-width CHAR) */)
5842a27b 351 (Lisp_Object ch)
0168c3d8 352{
df0b2940 353 int c;
6e6c82a4 354 ptrdiff_t width;
0168c3d8
KH
355
356 CHECK_CHARACTER (ch);
357 c = XINT (ch);
25ed9e61 358 width = char_width (c, buffer_display_table ());
0168c3d8
KH
359 return make_number (width);
360}
361
0168c3d8
KH
362/* Return width of string STR of length LEN when displayed in the
363 current buffer. The width is measured by how many columns it
364 occupies on the screen. If PRECISION > 0, return the width of
365 longest substring that doesn't exceed PRECISION, and set number of
366 characters and bytes of the substring in *NCHARS and *NBYTES
367 respectively. */
368
d311d28c
PE
369ptrdiff_t
370c_string_width (const unsigned char *str, ptrdiff_t len, int precision,
371 ptrdiff_t *nchars, ptrdiff_t *nbytes)
0168c3d8 372{
d311d28c
PE
373 ptrdiff_t i = 0, i_byte = 0;
374 ptrdiff_t width = 0;
0168c3d8
KH
375 struct Lisp_Char_Table *dp = buffer_display_table ();
376
377 while (i_byte < len)
378 {
25ed9e61 379 int bytes;
62a6e103 380 int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
6e6c82a4 381 ptrdiff_t thiswidth = char_width (c, dp);
0168c3d8 382
df0b2940 383 if (precision <= 0)
0168c3d8 384 {
df0b2940
PE
385 if (INT_ADD_OVERFLOW (width, thiswidth))
386 string_overflow ();
0168c3d8 387 }
df0b2940 388 else if (precision - width < thiswidth)
0168c3d8
KH
389 {
390 *nchars = i;
391 *nbytes = i_byte;
392 return width;
393 }
394 i++;
395 i_byte += bytes;
396 width += thiswidth;
397 }
398
399 if (precision > 0)
400 {
401 *nchars = i;
402 *nbytes = i_byte;
403 }
404
405 return width;
406}
407
1889b238
KH
408/* Return width of string STR of length LEN when displayed in the
409 current buffer. The width is measured by how many columns it
410 occupies on the screen. */
411
d311d28c
PE
412ptrdiff_t
413strwidth (const char *str, ptrdiff_t len)
1889b238 414{
7469ef5d 415 return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
1889b238
KH
416}
417
0168c3d8
KH
418/* Return width of Lisp string STRING when displayed in the current
419 buffer. The width is measured by how many columns it occupies on
420 the screen while paying attention to compositions. If PRECISION >
421 0, return the width of longest substring that doesn't exceed
422 PRECISION, and set number of characters and bytes of the substring
423 in *NCHARS and *NBYTES respectively. */
424
d311d28c
PE
425ptrdiff_t
426lisp_string_width (Lisp_Object string, ptrdiff_t precision,
427 ptrdiff_t *nchars, ptrdiff_t *nbytes)
0168c3d8 428{
d311d28c 429 ptrdiff_t len = SCHARS (string);
0aee65b9
KH
430 /* This set multibyte to 0 even if STRING is multibyte when it
431 contains only ascii and eight-bit-graphic, but that's
432 intentional. */
d5172d4f 433 bool multibyte = len < SBYTES (string);
8f924df7 434 unsigned char *str = SDATA (string);
d311d28c
PE
435 ptrdiff_t i = 0, i_byte = 0;
436 ptrdiff_t width = 0;
0168c3d8
KH
437 struct Lisp_Char_Table *dp = buffer_display_table ();
438
439 while (i < len)
440 {
d311d28c 441 ptrdiff_t chars, bytes, thiswidth;
0168c3d8 442 Lisp_Object val;
ebfa62c0 443 ptrdiff_t cmp_id;
d311d28c 444 ptrdiff_t ignore, end;
0168c3d8
KH
445
446 if (find_composition (i, -1, &ignore, &end, &val, string)
447 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
448 >= 0))
449 {
450 thiswidth = composition_table[cmp_id]->width;
451 chars = end - i;
452 bytes = string_char_to_byte (string, end) - i_byte;
453 }
0168c3d8
KH
454 else
455 {
0aee65b9 456 int c;
0168c3d8 457
0aee65b9 458 if (multibyte)
2b4560a8
PE
459 {
460 int cbytes;
461 c = STRING_CHAR_AND_LENGTH (str + i_byte, cbytes);
462 bytes = cbytes;
463 }
0aee65b9
KH
464 else
465 c = str[i_byte], bytes = 1;
0168c3d8 466 chars = 1;
25ed9e61 467 thiswidth = char_width (c, dp);
0168c3d8
KH
468 }
469
2b4560a8
PE
470 if (precision <= 0)
471 {
472#ifdef emacs
473 if (INT_ADD_OVERFLOW (width, thiswidth))
474 string_overflow ();
475#endif
476 }
477 else if (precision - width < thiswidth)
0168c3d8
KH
478 {
479 *nchars = i;
480 *nbytes = i_byte;
481 return width;
482 }
483 i += chars;
484 i_byte += bytes;
485 width += thiswidth;
ef3ff036 486 }
0168c3d8
KH
487
488 if (precision > 0)
489 {
490 *nchars = i;
491 *nbytes = i_byte;
492 }
493
494 return width;
495}
496
497DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
498 doc: /* Return width of STRING when displayed in the current buffer.
499Width is measured by how many columns it occupies on the screen.
500When calculating width of a multibyte character in STRING,
501only the base leading-code is considered; the validity of
502the following bytes is not checked. Tabs in STRING are always
add553ac
JB
503taken to occupy `tab-width' columns.
504usage: (string-width STRING) */)
5842a27b 505 (Lisp_Object str)
0168c3d8
KH
506{
507 Lisp_Object val;
508
509 CHECK_STRING (str);
510 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
511 return val;
512}
513
0168c3d8
KH
514/* Return the number of characters in the NBYTES bytes at PTR.
515 This works by looking at the contents and checking for multibyte
516 sequences while assuming that there's no invalid sequence.
517 However, if the current buffer has enable-multibyte-characters =
518 nil, we treat each byte as a character. */
519
d311d28c
PE
520ptrdiff_t
521chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
0168c3d8
KH
522{
523 /* current_buffer is null at early stages of Emacs initialization. */
524 if (current_buffer == 0
4b4deea2 525 || NILP (BVAR (current_buffer, enable_multibyte_characters)))
0168c3d8
KH
526 return nbytes;
527
528 return multibyte_chars_in_text (ptr, nbytes);
529}
530
531/* Return the number of characters in the NBYTES bytes at PTR.
532 This works by looking at the contents and checking for multibyte
533 sequences while assuming that there's no invalid sequence. It
534 ignores enable-multibyte-characters. */
535
d311d28c
PE
536ptrdiff_t
537multibyte_chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
0168c3d8 538{
8f924df7 539 const unsigned char *endp = ptr + nbytes;
d311d28c 540 ptrdiff_t chars = 0;
0168c3d8
KH
541
542 while (ptr < endp)
543 {
d311d28c 544 int len = MULTIBYTE_LENGTH (ptr, endp);
0168c3d8
KH
545
546 if (len == 0)
1088b922 547 emacs_abort ();
0168c3d8
KH
548 ptr += len;
549 chars++;
550 }
551
552 return chars;
553}
554
555/* Parse unibyte text at STR of LEN bytes as a multibyte text, count
556 characters and bytes in it, and store them in *NCHARS and *NBYTES
557 respectively. On counting bytes, pay attention to that 8-bit
558 characters not constructing a valid multibyte sequence are
559 represented by 2-byte in a multibyte text. */
560
561void
d311d28c
PE
562parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
563 ptrdiff_t *nchars, ptrdiff_t *nbytes)
0168c3d8 564{
8f924df7 565 const unsigned char *endp = str + len;
d311d28c
PE
566 int n;
567 ptrdiff_t chars = 0, bytes = 0;
0168c3d8
KH
568
569 if (len >= MAX_MULTIBYTE_LENGTH)
570 {
8f924df7 571 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
0168c3d8
KH
572 while (str < adjusted_endp)
573 {
fc9a17bc
KH
574 if (! CHAR_BYTE8_HEAD_P (*str)
575 && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
0168c3d8
KH
576 str += n, bytes += n;
577 else
578 str++, bytes += 2;
579 chars++;
580 }
581 }
582 while (str < endp)
583 {
fc9a17bc
KH
584 if (! CHAR_BYTE8_HEAD_P (*str)
585 && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
0168c3d8
KH
586 str += n, bytes += n;
587 else
588 str++, bytes += 2;
589 chars++;
590 }
591
592 *nchars = chars;
593 *nbytes = bytes;
594 return;
595}
596
597/* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
da6062e6 598 It actually converts only such 8-bit characters that don't construct
0168c3d8
KH
599 a multibyte sequence to multibyte forms of Latin-1 characters. If
600 NCHARS is nonzero, set *NCHARS to the number of characters in the
601 text. It is assured that we can use LEN bytes at STR as a work
602 area and that is enough. Return the number of bytes of the
603 resulting text. */
604
d311d28c
PE
605ptrdiff_t
606str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
607 ptrdiff_t *nchars)
0168c3d8
KH
608{
609 unsigned char *p = str, *endp = str + nbytes;
610 unsigned char *to;
d311d28c 611 ptrdiff_t chars = 0;
0168c3d8
KH
612 int n;
613
614 if (nbytes >= MAX_MULTIBYTE_LENGTH)
615 {
616 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
617 while (p < adjusted_endp
fc9a17bc 618 && ! CHAR_BYTE8_HEAD_P (*p)
0168c3d8
KH
619 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
620 p += n, chars++;
621 }
fc9a17bc
KH
622 while (p < endp
623 && ! CHAR_BYTE8_HEAD_P (*p)
624 && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
0168c3d8
KH
625 p += n, chars++;
626 if (nchars)
627 *nchars = chars;
628 if (p == endp)
629 return nbytes;
630
631 to = p;
632 nbytes = endp - p;
633 endp = str + len;
72af86bd 634 memmove (endp - nbytes, p, nbytes);
0168c3d8
KH
635 p = endp - nbytes;
636
637 if (nbytes >= MAX_MULTIBYTE_LENGTH)
638 {
639 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
640 while (p < adjusted_endp)
641 {
fc9a17bc
KH
642 if (! CHAR_BYTE8_HEAD_P (*p)
643 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
0168c3d8
KH
644 {
645 while (n--)
646 *to++ = *p++;
647 }
648 else
649 {
650 int c = *p++;
651 c = BYTE8_TO_CHAR (c);
652 to += CHAR_STRING (c, to);
653 }
654 }
655 chars++;
656 }
657 while (p < endp)
658 {
fc9a17bc
KH
659 if (! CHAR_BYTE8_HEAD_P (*p)
660 && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
0168c3d8
KH
661 {
662 while (n--)
663 *to++ = *p++;
8f924df7 664 }
0168c3d8
KH
665 else
666 {
667 int c = *p++;
668 c = BYTE8_TO_CHAR (c);
669 to += CHAR_STRING (c, to);
670 }
671 chars++;
672 }
673 if (nchars)
674 *nchars = chars;
675 return (to - str);
676}
677
678/* Parse unibyte string at STR of LEN bytes, and return the number of
e1dbe924 679 bytes it may occupy when converted to multibyte string by
0168c3d8
KH
680 `str_to_multibyte'. */
681
d311d28c
PE
682ptrdiff_t
683count_size_as_multibyte (const unsigned char *str, ptrdiff_t len)
0168c3d8 684{
eec47d6b 685 const unsigned char *endp = str + len;
d311d28c 686 ptrdiff_t bytes;
0168c3d8
KH
687
688 for (bytes = 0; str < endp; str++)
de883a70
PE
689 {
690 int n = *str < 0x80 ? 1 : 2;
691 if (INT_ADD_OVERFLOW (bytes, n))
692 string_overflow ();
693 bytes += n;
694 }
0168c3d8
KH
695 return bytes;
696}
697
698
ef3ff036 699/* Convert unibyte text at STR of BYTES bytes to a multibyte text
0168c3d8
KH
700 that contains the same single-byte characters. It actually
701 converts all 8-bit characters to multibyte forms. It is assured
702 that we can use LEN bytes at STR as a work area and that is
703 enough. */
704
d311d28c
PE
705ptrdiff_t
706str_to_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t bytes)
0168c3d8
KH
707{
708 unsigned char *p = str, *endp = str + bytes;
709 unsigned char *to;
710
711 while (p < endp && *p < 0x80) p++;
712 if (p == endp)
713 return bytes;
714 to = p;
715 bytes = endp - p;
716 endp = str + len;
72af86bd 717 memmove (endp - bytes, p, bytes);
0168c3d8 718 p = endp - bytes;
8f924df7 719 while (p < endp)
0168c3d8
KH
720 {
721 int c = *p++;
722
723 if (c >= 0x80)
724 c = BYTE8_TO_CHAR (c);
725 to += CHAR_STRING (c, to);
726 }
727 return (to - str);
728}
729
730/* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
731 actually converts characters in the range 0x80..0xFF to
732 unibyte. */
733
d311d28c
PE
734ptrdiff_t
735str_as_unibyte (unsigned char *str, ptrdiff_t bytes)
0168c3d8 736{
15843e6f
KH
737 const unsigned char *p = str, *endp = str + bytes;
738 unsigned char *to;
0168c3d8
KH
739 int c, len;
740
741 while (p < endp)
742 {
743 c = *p;
744 len = BYTES_BY_CHAR_HEAD (c);
745 if (CHAR_BYTE8_HEAD_P (c))
746 break;
747 p += len;
748 }
15843e6f 749 to = str + (p - str);
8f924df7 750 while (p < endp)
0168c3d8
KH
751 {
752 c = *p;
753 len = BYTES_BY_CHAR_HEAD (c);
754 if (CHAR_BYTE8_HEAD_P (c))
755 {
756 c = STRING_CHAR_ADVANCE (p);
757 *to++ = CHAR_TO_BYTE8 (c);
758 }
759 else
760 {
761 while (len--) *to++ = *p++;
762 }
763 }
764 return (to - str);
765}
766
4aa40bb8
KH
767/* Convert eight-bit chars in SRC (in multibyte form) to the
768 corresponding byte and store in DST. CHARS is the number of
769 characters in SRC. The value is the number of bytes stored in DST.
770 Usually, the value is the same as CHARS, but is less than it if SRC
d5172d4f 771 contains a non-ASCII, non-eight-bit character. */
4aa40bb8 772
d311d28c 773ptrdiff_t
d5172d4f 774str_to_unibyte (const unsigned char *src, unsigned char *dst, ptrdiff_t chars)
4aa40bb8 775{
d311d28c 776 ptrdiff_t i;
4aa40bb8
KH
777
778 for (i = 0; i < chars; i++)
779 {
780 int c = STRING_CHAR_ADVANCE (src);
781
782 if (CHAR_BYTE8_P (c))
783 c = CHAR_TO_BYTE8 (c);
d5172d4f 784 else if (! ASCII_CHAR_P (c))
4aa40bb8
KH
785 return i;
786 *dst++ = c;
787 }
788 return i;
789}
790
791
d311d28c 792static ptrdiff_t
971de7fb 793string_count_byte8 (Lisp_Object string)
0168c3d8 794{
d5172d4f 795 bool multibyte = STRING_MULTIBYTE (string);
d311d28c 796 ptrdiff_t nbytes = SBYTES (string);
8f924df7 797 unsigned char *p = SDATA (string);
0168c3d8 798 unsigned char *pend = p + nbytes;
d311d28c 799 ptrdiff_t count = 0;
0168c3d8
KH
800 int c, len;
801
802 if (multibyte)
803 while (p < pend)
804 {
805 c = *p;
806 len = BYTES_BY_CHAR_HEAD (c);
807
808 if (CHAR_BYTE8_HEAD_P (c))
809 count++;
810 p += len;
811 }
812 else
813 while (p < pend)
814 {
815 if (*p++ >= 0x80)
816 count++;
817 }
818 return count;
819}
820
821
822Lisp_Object
971de7fb 823string_escape_byte8 (Lisp_Object string)
0168c3d8 824{
d311d28c
PE
825 ptrdiff_t nchars = SCHARS (string);
826 ptrdiff_t nbytes = SBYTES (string);
d5172d4f 827 bool multibyte = STRING_MULTIBYTE (string);
d311d28c 828 ptrdiff_t byte8_count;
15843e6f
KH
829 const unsigned char *src, *src_end;
830 unsigned char *dst;
0168c3d8
KH
831 Lisp_Object val;
832 int c, len;
833
834 if (multibyte && nchars == nbytes)
835 return string;
836
837 byte8_count = string_count_byte8 (string);
838
839 if (byte8_count == 0)
840 return string;
841
842 if (multibyte)
14162469
EZ
843 {
844 if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
c9d624c6 845 || (STRING_BYTES_BOUND - nbytes) / 2 < byte8_count)
cb93f9be 846 string_overflow ();
14162469
EZ
847
848 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
849 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
850 nbytes + byte8_count * 2);
851 }
0168c3d8 852 else
14162469 853 {
c9d624c6 854 if ((STRING_BYTES_BOUND - nbytes) / 3 < byte8_count)
cb93f9be 855 string_overflow ();
ef3ff036 856
14162469
EZ
857 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
858 val = make_uninit_string (nbytes + byte8_count * 3);
859 }
0168c3d8 860
8f924df7 861 src = SDATA (string);
0168c3d8 862 src_end = src + nbytes;
8f924df7 863 dst = SDATA (val);
0168c3d8
KH
864 if (multibyte)
865 while (src < src_end)
866 {
867 c = *src;
868 len = BYTES_BY_CHAR_HEAD (c);
869
870 if (CHAR_BYTE8_HEAD_P (c))
871 {
872 c = STRING_CHAR_ADVANCE (src);
873 c = CHAR_TO_BYTE8 (c);
99027bdd 874 dst += sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
875 }
876 else
877 while (len--) *dst++ = *src++;
878 }
879 else
880 while (src < src_end)
881 {
882 c = *src++;
883 if (c >= 0x80)
99027bdd 884 dst += sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
885 else
886 *dst++ = c;
887 }
888 return val;
889}
890
891\f
a7ca3326 892DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
0168c3d8 893 doc: /*
d2e83296
DL
894Concatenate all the argument characters and make the result a string.
895usage: (string &rest CHARACTERS) */)
f66c7cf8 896 (ptrdiff_t n, Lisp_Object *args)
0168c3d8 897{
f66c7cf8 898 ptrdiff_t i;
c5101a77 899 int c;
754790b6
CY
900 unsigned char *buf, *p;
901 Lisp_Object str;
902 USE_SAFE_ALLOCA;
903
0065d054 904 SAFE_NALLOCA (buf, MAX_MULTIBYTE_LENGTH, n);
754790b6 905 p = buf;
0168c3d8
KH
906
907 for (i = 0; i < n; i++)
908 {
909 CHECK_CHARACTER (args[i]);
910 c = XINT (args[i]);
911 p += CHAR_STRING (c, p);
912 }
913
754790b6
CY
914 str = make_string_from_bytes ((char *) buf, n, p - buf);
915 SAFE_FREE ();
916 return str;
0168c3d8
KH
917}
918
70b4969d 919DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
87d6f965
KH
920 doc: /* Concatenate all the argument bytes and make the result a unibyte string.
921usage: (unibyte-string &rest BYTES) */)
f66c7cf8 922 (ptrdiff_t n, Lisp_Object *args)
70b4969d 923{
f66c7cf8 924 ptrdiff_t i;
754790b6
CY
925 Lisp_Object str;
926 USE_SAFE_ALLOCA;
98c6f1e3
PE
927 unsigned char *buf = SAFE_ALLOCA (n);
928 unsigned char *p = buf;
70b4969d
KH
929
930 for (i = 0; i < n; i++)
931 {
af5a5a98 932 CHECK_RANGED_INTEGER (args[i], 0, 255);
a14e1568 933 *p++ = XINT (args[i]);
70b4969d
KH
934 }
935
754790b6
CY
936 str = make_string_from_bytes ((char *) buf, n, p - buf);
937 SAFE_FREE ();
938 return str;
70b4969d
KH
939}
940
c73ae4ae 941DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
d0363d44
KH
942 Schar_resolve_modifiers, 1, 1, 0,
943 doc: /* Resolve modifiers in the character CHAR.
944The value is a character with modifiers resolved into the character
945code. Unresolved modifiers are kept in the value.
c73ae4ae 946usage: (char-resolve-modifiers CHAR) */)
5842a27b 947 (Lisp_Object character)
d0363d44 948{
d311d28c 949 EMACS_INT c;
d0363d44
KH
950
951 CHECK_NUMBER (character);
952 c = XINT (character);
953 return make_number (char_resolve_modifier_mask (c));
954}
955
ee107a89
KH
956DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
957 doc: /* Return a byte value of a character at point.
958Optional 1st arg POSITION, if non-nil, is a position of a character to get
959a byte value.
960Optional 2nd arg STRING, if non-nil, is a string of which first
961character is a target to get a byte value. In this case, POSITION, if
962non-nil, is an index of a target character in the string.
963
964If the current buffer (or STRING) is multibyte, and the target
8350f087 965character is not ASCII nor 8-bit character, an error is signaled. */)
5842a27b 966 (Lisp_Object position, Lisp_Object string)
ee107a89
KH
967{
968 int c;
d311d28c 969 ptrdiff_t pos;
ee107a89
KH
970 unsigned char *p;
971
972 if (NILP (string))
973 {
974 if (NILP (position))
975 {
976 p = PT_ADDR;
4d8e170e 977 }
ee107a89
KH
978 else
979 {
980 CHECK_NUMBER_COERCE_MARKER (position);
981 if (XINT (position) < BEGV || XINT (position) >= ZV)
982 args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
983 pos = XFASTINT (position);
984 p = CHAR_POS_ADDR (pos);
985 }
4b4deea2 986 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
d5998e03 987 return make_number (*p);
ee107a89
KH
988 }
989 else
990 {
991 CHECK_STRING (string);
992 if (NILP (position))
993 {
994 p = SDATA (string);
995 }
996 else
997 {
998 CHECK_NATNUM (position);
999 if (XINT (position) >= SCHARS (string))
1000 args_out_of_range (string, position);
1001 pos = XFASTINT (position);
1002 p = SDATA (string) + string_char_to_byte (string, pos);
1003 }
d5998e03
KH
1004 if (! STRING_MULTIBYTE (string))
1005 return make_number (*p);
ee107a89 1006 }
62a6e103 1007 c = STRING_CHAR (p);
ee107a89
KH
1008 if (CHAR_BYTE8_P (c))
1009 c = CHAR_TO_BYTE8 (c);
1010 else if (! ASCII_CHAR_P (c))
1011 error ("Not an ASCII nor an 8-bit character: %d", c);
1012 return make_number (c);
1013}
1014
0168c3d8
KH
1015#ifdef emacs
1016
1017void
971de7fb 1018syms_of_character (void)
0168c3d8
KH
1019{
1020 DEFSYM (Qcharacterp, "characterp");
1021 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1022
1023 staticpro (&Vchar_unify_table);
1024 Vchar_unify_table = Qnil;
1025
1026 defsubr (&Smax_char);
1027 defsubr (&Scharacterp);
1028 defsubr (&Sunibyte_char_to_multibyte);
1029 defsubr (&Smultibyte_char_to_unibyte);
0168c3d8
KH
1030 defsubr (&Schar_width);
1031 defsubr (&Sstring_width);
0168c3d8 1032 defsubr (&Sstring);
70b4969d 1033 defsubr (&Sunibyte_string);
d0363d44 1034 defsubr (&Schar_resolve_modifiers);
ee107a89 1035 defsubr (&Sget_byte);
0168c3d8 1036
29208e82 1037 DEFVAR_LISP ("translation-table-vector", Vtranslation_table_vector,
0168c3d8 1038 doc: /*
68978cf0
DL
1039Vector recording all translation tables ever defined.
1040Each element is a pair (SYMBOL . TABLE) relating the table to the
1041symbol naming it. The ID of a translation table is an index into this vector. */);
0168c3d8
KH
1042 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1043
29208e82 1044 DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
0168c3d8
KH
1045 doc: /*
1046A char-table for characters which invoke auto-filling.
1047Such characters have value t in this table. */);
1048 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
6cc0e1ca
DL
1049 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1050 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
0168c3d8 1051
29208e82 1052 DEFVAR_LISP ("char-width-table", Vchar_width_table,
0168c3d8
KH
1053 doc: /*
1054A char-table for width (columns) of each character. */);
1055 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
be8b50bc
KH
1056 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1057 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1058 make_number (4));
0168c3d8 1059
29208e82 1060 DEFVAR_LISP ("printable-chars", Vprintable_chars,
0168c3d8 1061 doc: /* A char-table for each printable character. */);
db6d4189 1062 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
67dde660
KH
1063 Fset_char_table_range (Vprintable_chars,
1064 Fcons (make_number (32), make_number (126)), Qt);
1065 Fset_char_table_range (Vprintable_chars,
1066 Fcons (make_number (160),
1067 make_number (MAX_5_BYTE_CHAR)), Qt);
15843e6f 1068
29208e82 1069 DEFVAR_LISP ("char-script-table", Vchar_script_table,
c57f3328
KH
1070 doc: /* Char table of script symbols.
1071It has one extra slot whose value is a list of script symbols. */);
1072
c57f3328
KH
1073 DEFSYM (Qchar_script_table, "char-script-table");
1074 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1075 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
c7e14352 1076
29208e82 1077 DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
c3bb7671 1078 doc: /* Alist of scripts vs the representative characters.
f4427a54 1079Each element is a cons (SCRIPT . CHARS).
4d8e170e 1080SCRIPT is a symbol representing a script or a subgroup of a script.
c3bb7671 1081CHARS is a list or a vector of characters.
472a4dc9 1082If it is a list, all characters in the list are necessary for supporting SCRIPT.
c3bb7671
KH
1083If it is a vector, one of the characters in the vector is necessary.
1084This variable is used to find a font for a specific script. */);
c7e14352 1085 Vscript_representative_chars = Qnil;
a3cbb631 1086
29208e82 1087 DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
a3cbb631 1088 doc: /* Char table of Unicode's "General Category".
472a4dc9
JB
1089All Unicode characters have one of the following values (symbol):
1090 Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
a3cbb631
KH
1091 Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1092See The Unicode Standard for the meaning of those values. */);
1093 /* The correct char-table is setup in characters.el. */
1094 Vunicode_category_table = Qnil;
0168c3d8
KH
1095}
1096
1097#endif /* emacs */