Adjust in response to jan.h.d's comments.
[bpt/emacs.git] / src / character.c
CommitLineData
0168c3d8 1/* Basic character support.
73b0cd50
GM
2
3Copyright (C) 2001-2011 Free Software Foundation, Inc.
4Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
5 Licensed to the Free Software Foundation.
6Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
7 National Institute of Advanced Industrial Science and Technology (AIST)
8 Registration Number H13PRO009
0168c3d8
KH
9
10This file is part of GNU Emacs.
11
9ec0b715 12GNU Emacs is free software: you can redistribute it and/or modify
0168c3d8 13it under the terms of the GNU General Public License as published by
9ec0b715
GM
14the Free Software Foundation, either version 3 of the License, or
15(at your option) any later version.
0168c3d8
KH
16
17GNU Emacs is distributed in the hope that it will be useful,
18but WITHOUT ANY WARRANTY; without even the implied warranty of
19MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20GNU General Public License for more details.
21
22You should have received a copy of the GNU General Public License
9ec0b715 23along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
0168c3d8
KH
24
25/* At first, see the document in `character.h' to understand the code
26 in this file. */
27
28#ifdef emacs
29#include <config.h>
30#endif
31
32#include <stdio.h>
33
34#ifdef emacs
35
36#include <sys/types.h>
d7306fe6 37#include <setjmp.h>
2b4560a8 38#include <intprops.h>
0168c3d8
KH
39#include "lisp.h"
40#include "character.h"
41#include "buffer.h"
42#include "charset.h"
43#include "composite.h"
44#include "disptab.h"
45
46#else /* not emacs */
47
48#include "mulelib.h"
49
50#endif /* emacs */
51
52Lisp_Object Qcharacterp;
53
955cbe7b 54static Lisp_Object Qauto_fill_chars;
0168c3d8 55
33f91981 56/* Char-table of information about which character to unify to which
6f1b43a0 57 Unicode character. Mainly used by the macro MAYBE_UNIFY_CHAR. */
0168c3d8
KH
58Lisp_Object Vchar_unify_table;
59
8973478b 60/* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
0168c3d8 61unsigned char *_fetch_multibyte_char_p;
0168c3d8 62
c57f3328
KH
63static Lisp_Object Qchar_script_table;
64
0168c3d8
KH
65\f
66
2bde7652
KH
67/* If character code C has modifier masks, reflect them to the
68 character code if possible. Return the resulting code. */
69
70int
971de7fb 71char_resolve_modifier_mask (int c)
2bde7652 72{
d0363d44 73 /* A non-ASCII character can't reflect modifier bits to the code. */
2bde7652
KH
74 if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
75 return c;
76
77 /* For Meta, Shift, and Control modifiers, we need special care. */
2bde7652
KH
78 if (c & CHAR_SHIFT)
79 {
80 /* Shift modifier is valid only with [A-Za-z]. */
81 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
82 c &= ~CHAR_SHIFT;
83 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
84 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
03365d0e
KH
85 /* Shift modifier for control characters and SPC is ignored. */
86 else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
d0363d44
KH
87 c &= ~CHAR_SHIFT;
88 }
2bde7652
KH
89 if (c & CHAR_CTL)
90 {
91 /* Simulate the code in lread.c. */
92 /* Allow `\C- ' and `\C-?'. */
03365d0e
KH
93 if ((c & 0377) == ' ')
94 c &= ~0177 & ~ CHAR_CTL;
95 else if ((c & 0377) == '?')
96 c = 0177 | (c & ~0177 & ~CHAR_CTL);
2bde7652
KH
97 /* ASCII control chars are made from letters (both cases),
98 as well as the non-letters within 0100...0137. */
99 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
100 c &= (037 | (~0177 & ~CHAR_CTL));
101 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
102 c &= (037 | (~0177 & ~CHAR_CTL));
103 }
e9c1637d 104#if 0 /* This is outside the scope of this function. (bug#4751) */
03365d0e
KH
105 if (c & CHAR_META)
106 {
107 /* Move the meta bit to the right place for a string. */
108 c = (c & ~CHAR_META) | 0x80;
109 }
e9c1637d 110#endif
2bde7652
KH
111
112 return c;
113}
114
115
33f91981
KH
116/* Store multibyte form of character C at P. If C has modifier bits,
117 handle them appropriately. */
118
0168c3d8 119int
971de7fb 120char_string (unsigned int c, unsigned char *p)
0168c3d8
KH
121{
122 int bytes;
123
e3d8eb8c
KH
124 if (c & CHAR_MODIFIER_MASK)
125 {
c5958d4c 126 c = char_resolve_modifier_mask (c);
e3d8eb8c
KH
127 /* If C still has any modifier bits, just ignore it. */
128 c &= ~CHAR_MODIFIER_MASK;
129 }
130
0168c3d8
KH
131 MAYBE_UNIFY_CHAR (c);
132
e3d8eb8c 133 if (c <= MAX_3_BYTE_CHAR)
0168c3d8
KH
134 {
135 bytes = CHAR_STRING (c, p);
136 }
137 else if (c <= MAX_4_BYTE_CHAR)
138 {
139 p[0] = (0xF0 | (c >> 18));
140 p[1] = (0x80 | ((c >> 12) & 0x3F));
141 p[2] = (0x80 | ((c >> 6) & 0x3F));
142 p[3] = (0x80 | (c & 0x3F));
143 bytes = 4;
144 }
e3d8eb8c 145 else if (c <= MAX_5_BYTE_CHAR)
0168c3d8
KH
146 {
147 p[0] = 0xF8;
148 p[1] = (0x80 | ((c >> 18) & 0x0F));
149 p[2] = (0x80 | ((c >> 12) & 0x3F));
150 p[3] = (0x80 | ((c >> 6) & 0x3F));
151 p[4] = (0x80 | (c & 0x3F));
152 bytes = 5;
153 }
5aa91c9b 154 else if (c <= MAX_CHAR)
e3d8eb8c
KH
155 {
156 c = CHAR_TO_BYTE8 (c);
157 bytes = BYTE8_STRING (c, p);
158 }
5aa91c9b 159 else
e6c3da20 160 error ("Invalid character: %x", c);
1889b238 161
0168c3d8
KH
162 return bytes;
163}
164
165
224a3131 166/* Return a character whose multibyte form is at P. If LEN is not
33f91981 167 NULL, it must be a pointer to integer. In that case, set *LEN to
224a3131 168 the byte length of the multibyte form. If ADVANCED is not NULL, it
33f91981 169 must be a pointer to unsigned char. In that case, set *ADVANCED to
224a3131 170 the ending address (i.e., the starting address of the next
33f91981
KH
171 character) of the multibyte form. */
172
0168c3d8 173int
971de7fb 174string_char (const unsigned char *p, const unsigned char **advanced, int *len)
0168c3d8 175{
1889b238 176 int c;
15843e6f 177 const unsigned char *saved_p = p;
0168c3d8
KH
178
179 if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
180 {
181 c = STRING_CHAR_ADVANCE (p);
182 }
183 else if (! (*p & 0x08))
184 {
185 c = ((((p)[0] & 0xF) << 18)
186 | (((p)[1] & 0x3F) << 12)
187 | (((p)[2] & 0x3F) << 6)
188 | ((p)[3] & 0x3F));
189 p += 4;
190 }
191 else
192 {
193 c = ((((p)[1] & 0x3F) << 18)
194 | (((p)[2] & 0x3F) << 12)
195 | (((p)[3] & 0x3F) << 6)
196 | ((p)[4] & 0x3F));
197 p += 5;
198 }
199
200 MAYBE_UNIFY_CHAR (c);
201
202 if (len)
203 *len = p - saved_p;
204 if (advanced)
205 *advanced = p;
206 return c;
207}
208
209
224a3131
EZ
210/* Translate character C by translation table TABLE. If no translation is
211 found in TABLE, return the untranslated character. If TABLE is a list,
212 elements are char tables. In that case, recursively translate C by all the
213 tables in the list. */
0168c3d8
KH
214
215int
971de7fb 216translate_char (Lisp_Object table, int c)
0168c3d8 217{
10453be9
KH
218 if (CHAR_TABLE_P (table))
219 {
220 Lisp_Object ch;
221
222 ch = CHAR_TABLE_REF (table, c);
223 if (CHARACTERP (ch))
224 c = XINT (ch);
225 }
226 else
227 {
228 for (; CONSP (table); table = XCDR (table))
229 c = translate_char (XCAR (table), c);
230 }
231 return c;
0168c3d8
KH
232}
233
2e5db15c 234/* Convert ASCII or 8-bit character C to unibyte. If C is none of
461c2ab9 235 them, return (C & 0xFF). */
0168c3d8
KH
236
237int
461c2ab9 238multibyte_char_to_unibyte (int c)
0168c3d8 239{
2e5db15c
KH
240 if (c < 0x80)
241 return c;
b672c5ae
KH
242 if (CHAR_BYTE8_P (c))
243 return CHAR_TO_BYTE8 (c);
2e5db15c 244 return (c & 0xFF);
0168c3d8
KH
245}
246
935d5b02
KH
247/* Like multibyte_char_to_unibyte, but return -1 if C is not supported
248 by charset_unibyte. */
249
250int
971de7fb 251multibyte_char_to_unibyte_safe (int c)
935d5b02 252{
2e5db15c
KH
253 if (c < 0x80)
254 return c;
935d5b02
KH
255 if (CHAR_BYTE8_P (c))
256 return CHAR_TO_BYTE8 (c);
2e5db15c 257 return -1;
935d5b02 258}
0168c3d8
KH
259
260DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
bc985c87
AS
261 doc: /* Return non-nil if OBJECT is a character.
262usage: (characterp OBJECT) */)
5842a27b 263 (Lisp_Object object, Lisp_Object ignore)
0168c3d8
KH
264{
265 return (CHARACTERP (object) ? Qt : Qnil);
266}
267
268DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
269 doc: /* Return the character of the maximum code. */)
5842a27b 270 (void)
0168c3d8
KH
271{
272 return make_number (MAX_CHAR);
273}
274
275DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
276 Sunibyte_char_to_multibyte, 1, 1, 0,
5556875b 277 doc: /* Convert the byte CH to multibyte character. */)
5842a27b 278 (Lisp_Object ch)
0168c3d8
KH
279{
280 int c;
0168c3d8
KH
281
282 CHECK_CHARACTER (ch);
283 c = XFASTINT (ch);
2e5db15c
KH
284 if (c >= 0x100)
285 error ("Not a unibyte character: %d", c);
4c0354d7 286 MAKE_CHAR_MULTIBYTE (c);
0168c3d8
KH
287 return make_number (c);
288}
289
290DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
291 Smultibyte_char_to_unibyte, 1, 1, 0,
5556875b
SM
292 doc: /* Convert the multibyte character CH to a byte.
293If the multibyte character does not represent a byte, return -1. */)
5842a27b 294 (Lisp_Object ch)
0168c3d8 295{
5556875b 296 int cm;
0168c3d8
KH
297
298 CHECK_CHARACTER (ch);
5556875b
SM
299 cm = XFASTINT (ch);
300 if (cm < 256)
301 /* Can't distinguish a byte read from a unibyte buffer from
302 a latin1 char, so let's let it slide. */
303 return ch;
304 else
305 {
2afc21f5 306 int cu = CHAR_TO_BYTE_SAFE (cm);
5556875b
SM
307 return make_number (cu);
308 }
0168c3d8
KH
309}
310
a7ca3326 311DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
0168c3d8
KH
312 doc: /* Return width of CHAR when displayed in the current buffer.
313The width is measured by how many columns it occupies on the screen.
add553ac
JB
314Tab is taken to occupy `tab-width' columns.
315usage: (char-width CHAR) */)
5842a27b 316 (Lisp_Object ch)
0168c3d8
KH
317{
318 Lisp_Object disp;
319 int c, width;
320 struct Lisp_Char_Table *dp = buffer_display_table ();
321
322 CHECK_CHARACTER (ch);
323 c = XINT (ch);
324
325 /* Get the way the display table would display it. */
326 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
327
328 if (VECTORP (disp))
5637687f 329 width = sanitize_char_width (ASIZE (disp));
0168c3d8
KH
330 else
331 width = CHAR_WIDTH (c);
332
333 return make_number (width);
334}
335
0168c3d8
KH
336/* Return width of string STR of length LEN when displayed in the
337 current buffer. The width is measured by how many columns it
338 occupies on the screen. If PRECISION > 0, return the width of
339 longest substring that doesn't exceed PRECISION, and set number of
340 characters and bytes of the substring in *NCHARS and *NBYTES
341 respectively. */
342
579c18d0
EZ
343EMACS_INT
344c_string_width (const unsigned char *str, EMACS_INT len, int precision,
345 EMACS_INT *nchars, EMACS_INT *nbytes)
0168c3d8 346{
579c18d0
EZ
347 EMACS_INT i = 0, i_byte = 0;
348 EMACS_INT width = 0;
0168c3d8
KH
349 struct Lisp_Char_Table *dp = buffer_display_table ();
350
351 while (i_byte < len)
352 {
353 int bytes, thiswidth;
354 Lisp_Object val;
62a6e103 355 int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
0168c3d8
KH
356
357 if (dp)
358 {
359 val = DISP_CHAR_VECTOR (dp, c);
360 if (VECTORP (val))
5637687f 361 thiswidth = sanitize_char_width (ASIZE (val));
0168c3d8
KH
362 else
363 thiswidth = CHAR_WIDTH (c);
364 }
365 else
366 {
367 thiswidth = CHAR_WIDTH (c);
368 }
369
370 if (precision > 0
371 && (width + thiswidth > precision))
372 {
373 *nchars = i;
374 *nbytes = i_byte;
375 return width;
376 }
377 i++;
378 i_byte += bytes;
379 width += thiswidth;
380 }
381
382 if (precision > 0)
383 {
384 *nchars = i;
385 *nbytes = i_byte;
386 }
387
388 return width;
389}
390
1889b238
KH
391/* Return width of string STR of length LEN when displayed in the
392 current buffer. The width is measured by how many columns it
393 occupies on the screen. */
394
579c18d0 395EMACS_INT
7469ef5d 396strwidth (const char *str, EMACS_INT len)
1889b238 397{
7469ef5d 398 return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
1889b238
KH
399}
400
0168c3d8
KH
401/* Return width of Lisp string STRING when displayed in the current
402 buffer. The width is measured by how many columns it occupies on
403 the screen while paying attention to compositions. If PRECISION >
404 0, return the width of longest substring that doesn't exceed
405 PRECISION, and set number of characters and bytes of the substring
406 in *NCHARS and *NBYTES respectively. */
407
579c18d0 408EMACS_INT
2b4560a8 409lisp_string_width (Lisp_Object string, EMACS_INT precision,
579c18d0 410 EMACS_INT *nchars, EMACS_INT *nbytes)
0168c3d8 411{
579c18d0 412 EMACS_INT len = SCHARS (string);
0aee65b9
KH
413 /* This set multibyte to 0 even if STRING is multibyte when it
414 contains only ascii and eight-bit-graphic, but that's
415 intentional. */
416 int multibyte = len < SBYTES (string);
8f924df7 417 unsigned char *str = SDATA (string);
579c18d0
EZ
418 EMACS_INT i = 0, i_byte = 0;
419 EMACS_INT width = 0;
0168c3d8
KH
420 struct Lisp_Char_Table *dp = buffer_display_table ();
421
422 while (i < len)
423 {
2b4560a8 424 EMACS_INT chars, bytes, thiswidth;
0168c3d8 425 Lisp_Object val;
ebfa62c0 426 ptrdiff_t cmp_id;
f4bc0685 427 EMACS_INT ignore, end;
0168c3d8
KH
428
429 if (find_composition (i, -1, &ignore, &end, &val, string)
430 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
431 >= 0))
432 {
433 thiswidth = composition_table[cmp_id]->width;
434 chars = end - i;
435 bytes = string_char_to_byte (string, end) - i_byte;
436 }
0168c3d8
KH
437 else
438 {
0aee65b9 439 int c;
0168c3d8 440
0aee65b9 441 if (multibyte)
2b4560a8
PE
442 {
443 int cbytes;
444 c = STRING_CHAR_AND_LENGTH (str + i_byte, cbytes);
445 bytes = cbytes;
446 }
0aee65b9
KH
447 else
448 c = str[i_byte], bytes = 1;
0168c3d8 449 chars = 1;
0aee65b9
KH
450 if (dp)
451 {
452 val = DISP_CHAR_VECTOR (dp, c);
453 if (VECTORP (val))
5637687f 454 thiswidth = sanitize_char_width (ASIZE (val));
0aee65b9
KH
455 else
456 thiswidth = CHAR_WIDTH (c);
457 }
458 else
459 {
460 thiswidth = CHAR_WIDTH (c);
461 }
0168c3d8
KH
462 }
463
2b4560a8
PE
464 if (precision <= 0)
465 {
466#ifdef emacs
467 if (INT_ADD_OVERFLOW (width, thiswidth))
468 string_overflow ();
469#endif
470 }
471 else if (precision - width < thiswidth)
0168c3d8
KH
472 {
473 *nchars = i;
474 *nbytes = i_byte;
475 return width;
476 }
477 i += chars;
478 i_byte += bytes;
479 width += thiswidth;
ef3ff036 480 }
0168c3d8
KH
481
482 if (precision > 0)
483 {
484 *nchars = i;
485 *nbytes = i_byte;
486 }
487
488 return width;
489}
490
491DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
492 doc: /* Return width of STRING when displayed in the current buffer.
493Width is measured by how many columns it occupies on the screen.
494When calculating width of a multibyte character in STRING,
495only the base leading-code is considered; the validity of
496the following bytes is not checked. Tabs in STRING are always
add553ac
JB
497taken to occupy `tab-width' columns.
498usage: (string-width STRING) */)
5842a27b 499 (Lisp_Object str)
0168c3d8
KH
500{
501 Lisp_Object val;
502
503 CHECK_STRING (str);
504 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
505 return val;
506}
507
0168c3d8
KH
508/* Return the number of characters in the NBYTES bytes at PTR.
509 This works by looking at the contents and checking for multibyte
510 sequences while assuming that there's no invalid sequence.
511 However, if the current buffer has enable-multibyte-characters =
512 nil, we treat each byte as a character. */
513
13818c30 514EMACS_INT
971de7fb 515chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
0168c3d8
KH
516{
517 /* current_buffer is null at early stages of Emacs initialization. */
518 if (current_buffer == 0
4b4deea2 519 || NILP (BVAR (current_buffer, enable_multibyte_characters)))
0168c3d8
KH
520 return nbytes;
521
522 return multibyte_chars_in_text (ptr, nbytes);
523}
524
525/* Return the number of characters in the NBYTES bytes at PTR.
526 This works by looking at the contents and checking for multibyte
527 sequences while assuming that there's no invalid sequence. It
528 ignores enable-multibyte-characters. */
529
13818c30 530EMACS_INT
971de7fb 531multibyte_chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
0168c3d8 532{
8f924df7 533 const unsigned char *endp = ptr + nbytes;
579c18d0 534 EMACS_INT chars = 0;
0168c3d8
KH
535
536 while (ptr < endp)
537 {
579c18d0 538 EMACS_INT len = MULTIBYTE_LENGTH (ptr, endp);
0168c3d8
KH
539
540 if (len == 0)
541 abort ();
542 ptr += len;
543 chars++;
544 }
545
546 return chars;
547}
548
549/* Parse unibyte text at STR of LEN bytes as a multibyte text, count
550 characters and bytes in it, and store them in *NCHARS and *NBYTES
551 respectively. On counting bytes, pay attention to that 8-bit
552 characters not constructing a valid multibyte sequence are
553 represented by 2-byte in a multibyte text. */
554
555void
14162469
EZ
556parse_str_as_multibyte (const unsigned char *str, EMACS_INT len,
557 EMACS_INT *nchars, EMACS_INT *nbytes)
0168c3d8 558{
8f924df7 559 const unsigned char *endp = str + len;
14162469 560 EMACS_INT n, chars = 0, bytes = 0;
0168c3d8
KH
561
562 if (len >= MAX_MULTIBYTE_LENGTH)
563 {
8f924df7 564 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
0168c3d8
KH
565 while (str < adjusted_endp)
566 {
fc9a17bc
KH
567 if (! CHAR_BYTE8_HEAD_P (*str)
568 && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
0168c3d8
KH
569 str += n, bytes += n;
570 else
571 str++, bytes += 2;
572 chars++;
573 }
574 }
575 while (str < endp)
576 {
fc9a17bc
KH
577 if (! CHAR_BYTE8_HEAD_P (*str)
578 && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
0168c3d8
KH
579 str += n, bytes += n;
580 else
581 str++, bytes += 2;
582 chars++;
583 }
584
585 *nchars = chars;
586 *nbytes = bytes;
587 return;
588}
589
590/* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
591 It actually converts only such 8-bit characters that don't contruct
592 a multibyte sequence to multibyte forms of Latin-1 characters. If
593 NCHARS is nonzero, set *NCHARS to the number of characters in the
594 text. It is assured that we can use LEN bytes at STR as a work
595 area and that is enough. Return the number of bytes of the
596 resulting text. */
597
14162469
EZ
598EMACS_INT
599str_as_multibyte (unsigned char *str, EMACS_INT len, EMACS_INT nbytes,
600 EMACS_INT *nchars)
0168c3d8
KH
601{
602 unsigned char *p = str, *endp = str + nbytes;
603 unsigned char *to;
14162469 604 EMACS_INT chars = 0;
0168c3d8
KH
605 int n;
606
607 if (nbytes >= MAX_MULTIBYTE_LENGTH)
608 {
609 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
610 while (p < adjusted_endp
fc9a17bc 611 && ! CHAR_BYTE8_HEAD_P (*p)
0168c3d8
KH
612 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
613 p += n, chars++;
614 }
fc9a17bc
KH
615 while (p < endp
616 && ! CHAR_BYTE8_HEAD_P (*p)
617 && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
0168c3d8
KH
618 p += n, chars++;
619 if (nchars)
620 *nchars = chars;
621 if (p == endp)
622 return nbytes;
623
624 to = p;
625 nbytes = endp - p;
626 endp = str + len;
72af86bd 627 memmove (endp - nbytes, p, nbytes);
0168c3d8
KH
628 p = endp - nbytes;
629
630 if (nbytes >= MAX_MULTIBYTE_LENGTH)
631 {
632 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
633 while (p < adjusted_endp)
634 {
fc9a17bc
KH
635 if (! CHAR_BYTE8_HEAD_P (*p)
636 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
0168c3d8
KH
637 {
638 while (n--)
639 *to++ = *p++;
640 }
641 else
642 {
643 int c = *p++;
644 c = BYTE8_TO_CHAR (c);
645 to += CHAR_STRING (c, to);
646 }
647 }
648 chars++;
649 }
650 while (p < endp)
651 {
fc9a17bc
KH
652 if (! CHAR_BYTE8_HEAD_P (*p)
653 && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
0168c3d8
KH
654 {
655 while (n--)
656 *to++ = *p++;
8f924df7 657 }
0168c3d8
KH
658 else
659 {
660 int c = *p++;
661 c = BYTE8_TO_CHAR (c);
662 to += CHAR_STRING (c, to);
663 }
664 chars++;
665 }
666 if (nchars)
667 *nchars = chars;
668 return (to - str);
669}
670
671/* Parse unibyte string at STR of LEN bytes, and return the number of
672 bytes it may ocupy when converted to multibyte string by
673 `str_to_multibyte'. */
674
14162469 675EMACS_INT
de883a70 676count_size_as_multibyte (const unsigned char *str, EMACS_INT len)
0168c3d8 677{
eec47d6b 678 const unsigned char *endp = str + len;
14162469 679 EMACS_INT bytes;
0168c3d8
KH
680
681 for (bytes = 0; str < endp; str++)
de883a70
PE
682 {
683 int n = *str < 0x80 ? 1 : 2;
684 if (INT_ADD_OVERFLOW (bytes, n))
685 string_overflow ();
686 bytes += n;
687 }
0168c3d8
KH
688 return bytes;
689}
690
691
ef3ff036 692/* Convert unibyte text at STR of BYTES bytes to a multibyte text
0168c3d8
KH
693 that contains the same single-byte characters. It actually
694 converts all 8-bit characters to multibyte forms. It is assured
695 that we can use LEN bytes at STR as a work area and that is
696 enough. */
697
14162469
EZ
698EMACS_INT
699str_to_multibyte (unsigned char *str, EMACS_INT len, EMACS_INT bytes)
0168c3d8
KH
700{
701 unsigned char *p = str, *endp = str + bytes;
702 unsigned char *to;
703
704 while (p < endp && *p < 0x80) p++;
705 if (p == endp)
706 return bytes;
707 to = p;
708 bytes = endp - p;
709 endp = str + len;
72af86bd 710 memmove (endp - bytes, p, bytes);
0168c3d8 711 p = endp - bytes;
8f924df7 712 while (p < endp)
0168c3d8
KH
713 {
714 int c = *p++;
715
716 if (c >= 0x80)
717 c = BYTE8_TO_CHAR (c);
718 to += CHAR_STRING (c, to);
719 }
720 return (to - str);
721}
722
723/* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
724 actually converts characters in the range 0x80..0xFF to
725 unibyte. */
726
14162469
EZ
727EMACS_INT
728str_as_unibyte (unsigned char *str, EMACS_INT bytes)
0168c3d8 729{
15843e6f
KH
730 const unsigned char *p = str, *endp = str + bytes;
731 unsigned char *to;
0168c3d8
KH
732 int c, len;
733
734 while (p < endp)
735 {
736 c = *p;
737 len = BYTES_BY_CHAR_HEAD (c);
738 if (CHAR_BYTE8_HEAD_P (c))
739 break;
740 p += len;
741 }
15843e6f 742 to = str + (p - str);
8f924df7 743 while (p < endp)
0168c3d8
KH
744 {
745 c = *p;
746 len = BYTES_BY_CHAR_HEAD (c);
747 if (CHAR_BYTE8_HEAD_P (c))
748 {
749 c = STRING_CHAR_ADVANCE (p);
750 *to++ = CHAR_TO_BYTE8 (c);
751 }
752 else
753 {
754 while (len--) *to++ = *p++;
755 }
756 }
757 return (to - str);
758}
759
4aa40bb8
KH
760/* Convert eight-bit chars in SRC (in multibyte form) to the
761 corresponding byte and store in DST. CHARS is the number of
762 characters in SRC. The value is the number of bytes stored in DST.
763 Usually, the value is the same as CHARS, but is less than it if SRC
8307f923 764 contains a non-ASCII, non-eight-bit character. If ACCEPT_LATIN_1
4aa40bb8 765 is nonzero, a Latin-1 character is accepted and converted to a byte
f27f70ec
KH
766 of that character code.
767 Note: Currently the arg ACCEPT_LATIN_1 is not used. */
4aa40bb8
KH
768
769EMACS_INT
971de7fb 770str_to_unibyte (const unsigned char *src, unsigned char *dst, EMACS_INT chars, int accept_latin_1)
4aa40bb8
KH
771{
772 EMACS_INT i;
773
774 for (i = 0; i < chars; i++)
775 {
776 int c = STRING_CHAR_ADVANCE (src);
777
778 if (CHAR_BYTE8_P (c))
779 c = CHAR_TO_BYTE8 (c);
780 else if (! ASCII_CHAR_P (c)
781 && (! accept_latin_1 || c >= 0x100))
782 return i;
783 *dst++ = c;
784 }
785 return i;
786}
787
788
d0891610 789static EMACS_INT
971de7fb 790string_count_byte8 (Lisp_Object string)
0168c3d8
KH
791{
792 int multibyte = STRING_MULTIBYTE (string);
14162469 793 EMACS_INT nbytes = SBYTES (string);
8f924df7 794 unsigned char *p = SDATA (string);
0168c3d8 795 unsigned char *pend = p + nbytes;
14162469 796 EMACS_INT count = 0;
0168c3d8
KH
797 int c, len;
798
799 if (multibyte)
800 while (p < pend)
801 {
802 c = *p;
803 len = BYTES_BY_CHAR_HEAD (c);
804
805 if (CHAR_BYTE8_HEAD_P (c))
806 count++;
807 p += len;
808 }
809 else
810 while (p < pend)
811 {
812 if (*p++ >= 0x80)
813 count++;
814 }
815 return count;
816}
817
818
819Lisp_Object
971de7fb 820string_escape_byte8 (Lisp_Object string)
0168c3d8 821{
14162469
EZ
822 EMACS_INT nchars = SCHARS (string);
823 EMACS_INT nbytes = SBYTES (string);
0168c3d8 824 int multibyte = STRING_MULTIBYTE (string);
14162469 825 EMACS_INT byte8_count;
15843e6f
KH
826 const unsigned char *src, *src_end;
827 unsigned char *dst;
0168c3d8
KH
828 Lisp_Object val;
829 int c, len;
830
831 if (multibyte && nchars == nbytes)
832 return string;
833
834 byte8_count = string_count_byte8 (string);
835
836 if (byte8_count == 0)
837 return string;
838
839 if (multibyte)
14162469
EZ
840 {
841 if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
c9d624c6 842 || (STRING_BYTES_BOUND - nbytes) / 2 < byte8_count)
cb93f9be 843 string_overflow ();
14162469
EZ
844
845 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
846 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
847 nbytes + byte8_count * 2);
848 }
0168c3d8 849 else
14162469 850 {
c9d624c6 851 if ((STRING_BYTES_BOUND - nbytes) / 3 < byte8_count)
cb93f9be 852 string_overflow ();
ef3ff036 853
14162469
EZ
854 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
855 val = make_uninit_string (nbytes + byte8_count * 3);
856 }
0168c3d8 857
8f924df7 858 src = SDATA (string);
0168c3d8 859 src_end = src + nbytes;
8f924df7 860 dst = SDATA (val);
0168c3d8
KH
861 if (multibyte)
862 while (src < src_end)
863 {
864 c = *src;
865 len = BYTES_BY_CHAR_HEAD (c);
866
867 if (CHAR_BYTE8_HEAD_P (c))
868 {
869 c = STRING_CHAR_ADVANCE (src);
870 c = CHAR_TO_BYTE8 (c);
1889b238 871 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
872 dst += 4;
873 }
874 else
875 while (len--) *dst++ = *src++;
876 }
877 else
878 while (src < src_end)
879 {
880 c = *src++;
881 if (c >= 0x80)
882 {
1889b238 883 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
884 dst += 4;
885 }
886 else
887 *dst++ = c;
888 }
889 return val;
890}
891
892\f
a7ca3326 893DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
0168c3d8 894 doc: /*
d2e83296
DL
895Concatenate all the argument characters and make the result a string.
896usage: (string &rest CHARACTERS) */)
f66c7cf8 897 (ptrdiff_t n, Lisp_Object *args)
0168c3d8 898{
f66c7cf8 899 ptrdiff_t i;
c5101a77 900 int c;
754790b6
CY
901 unsigned char *buf, *p;
902 Lisp_Object str;
903 USE_SAFE_ALLOCA;
904
0065d054 905 SAFE_NALLOCA (buf, MAX_MULTIBYTE_LENGTH, n);
754790b6 906 p = buf;
0168c3d8
KH
907
908 for (i = 0; i < n; i++)
909 {
910 CHECK_CHARACTER (args[i]);
911 c = XINT (args[i]);
912 p += CHAR_STRING (c, p);
913 }
914
754790b6
CY
915 str = make_string_from_bytes ((char *) buf, n, p - buf);
916 SAFE_FREE ();
917 return str;
0168c3d8
KH
918}
919
70b4969d 920DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
87d6f965
KH
921 doc: /* Concatenate all the argument bytes and make the result a unibyte string.
922usage: (unibyte-string &rest BYTES) */)
f66c7cf8 923 (ptrdiff_t n, Lisp_Object *args)
70b4969d 924{
f66c7cf8 925 ptrdiff_t i;
c5101a77 926 int c;
754790b6
CY
927 unsigned char *buf, *p;
928 Lisp_Object str;
929 USE_SAFE_ALLOCA;
930
931 SAFE_ALLOCA (buf, unsigned char *, n);
932 p = buf;
70b4969d
KH
933
934 for (i = 0; i < n; i++)
935 {
936 CHECK_NATNUM (args[i]);
937 c = XFASTINT (args[i]);
938 if (c >= 256)
939 args_out_of_range_3 (args[i], make_number (0), make_number (255));
940 *p++ = c;
941 }
942
754790b6
CY
943 str = make_string_from_bytes ((char *) buf, n, p - buf);
944 SAFE_FREE ();
945 return str;
70b4969d
KH
946}
947
c73ae4ae 948DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
d0363d44
KH
949 Schar_resolve_modifiers, 1, 1, 0,
950 doc: /* Resolve modifiers in the character CHAR.
951The value is a character with modifiers resolved into the character
952code. Unresolved modifiers are kept in the value.
c73ae4ae 953usage: (char-resolve-modifiers CHAR) */)
5842a27b 954 (Lisp_Object character)
d0363d44
KH
955{
956 int c;
957
958 CHECK_NUMBER (character);
959 c = XINT (character);
960 return make_number (char_resolve_modifier_mask (c));
961}
962
ee107a89
KH
963DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
964 doc: /* Return a byte value of a character at point.
965Optional 1st arg POSITION, if non-nil, is a position of a character to get
966a byte value.
967Optional 2nd arg STRING, if non-nil, is a string of which first
968character is a target to get a byte value. In this case, POSITION, if
969non-nil, is an index of a target character in the string.
970
971If the current buffer (or STRING) is multibyte, and the target
972character is not ASCII nor 8-bit character, an error is signalled. */)
5842a27b 973 (Lisp_Object position, Lisp_Object string)
ee107a89
KH
974{
975 int c;
976 EMACS_INT pos;
977 unsigned char *p;
978
979 if (NILP (string))
980 {
981 if (NILP (position))
982 {
983 p = PT_ADDR;
4d8e170e 984 }
ee107a89
KH
985 else
986 {
987 CHECK_NUMBER_COERCE_MARKER (position);
988 if (XINT (position) < BEGV || XINT (position) >= ZV)
989 args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
990 pos = XFASTINT (position);
991 p = CHAR_POS_ADDR (pos);
992 }
4b4deea2 993 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
d5998e03 994 return make_number (*p);
ee107a89
KH
995 }
996 else
997 {
998 CHECK_STRING (string);
999 if (NILP (position))
1000 {
1001 p = SDATA (string);
1002 }
1003 else
1004 {
1005 CHECK_NATNUM (position);
1006 if (XINT (position) >= SCHARS (string))
1007 args_out_of_range (string, position);
1008 pos = XFASTINT (position);
1009 p = SDATA (string) + string_char_to_byte (string, pos);
1010 }
d5998e03
KH
1011 if (! STRING_MULTIBYTE (string))
1012 return make_number (*p);
ee107a89 1013 }
62a6e103 1014 c = STRING_CHAR (p);
ee107a89
KH
1015 if (CHAR_BYTE8_P (c))
1016 c = CHAR_TO_BYTE8 (c);
1017 else if (! ASCII_CHAR_P (c))
1018 error ("Not an ASCII nor an 8-bit character: %d", c);
1019 return make_number (c);
1020}
1021
1022
0168c3d8 1023void
971de7fb 1024init_character_once (void)
0168c3d8
KH
1025{
1026}
1027
1028#ifdef emacs
1029
1030void
971de7fb 1031syms_of_character (void)
0168c3d8
KH
1032{
1033 DEFSYM (Qcharacterp, "characterp");
1034 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1035
1036 staticpro (&Vchar_unify_table);
1037 Vchar_unify_table = Qnil;
1038
1039 defsubr (&Smax_char);
1040 defsubr (&Scharacterp);
1041 defsubr (&Sunibyte_char_to_multibyte);
1042 defsubr (&Smultibyte_char_to_unibyte);
0168c3d8
KH
1043 defsubr (&Schar_width);
1044 defsubr (&Sstring_width);
0168c3d8 1045 defsubr (&Sstring);
70b4969d 1046 defsubr (&Sunibyte_string);
d0363d44 1047 defsubr (&Schar_resolve_modifiers);
ee107a89 1048 defsubr (&Sget_byte);
0168c3d8 1049
29208e82 1050 DEFVAR_LISP ("translation-table-vector", Vtranslation_table_vector,
0168c3d8 1051 doc: /*
68978cf0
DL
1052Vector recording all translation tables ever defined.
1053Each element is a pair (SYMBOL . TABLE) relating the table to the
1054symbol naming it. The ID of a translation table is an index into this vector. */);
0168c3d8
KH
1055 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1056
29208e82 1057 DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
0168c3d8
KH
1058 doc: /*
1059A char-table for characters which invoke auto-filling.
1060Such characters have value t in this table. */);
1061 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
6cc0e1ca
DL
1062 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1063 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
0168c3d8 1064
29208e82 1065 DEFVAR_LISP ("char-width-table", Vchar_width_table,
0168c3d8
KH
1066 doc: /*
1067A char-table for width (columns) of each character. */);
1068 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
be8b50bc
KH
1069 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1070 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1071 make_number (4));
0168c3d8 1072
29208e82 1073 DEFVAR_LISP ("printable-chars", Vprintable_chars,
0168c3d8 1074 doc: /* A char-table for each printable character. */);
db6d4189 1075 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
67dde660
KH
1076 Fset_char_table_range (Vprintable_chars,
1077 Fcons (make_number (32), make_number (126)), Qt);
1078 Fset_char_table_range (Vprintable_chars,
1079 Fcons (make_number (160),
1080 make_number (MAX_5_BYTE_CHAR)), Qt);
15843e6f 1081
29208e82 1082 DEFVAR_LISP ("char-script-table", Vchar_script_table,
c57f3328
KH
1083 doc: /* Char table of script symbols.
1084It has one extra slot whose value is a list of script symbols. */);
1085
1086 /* Intern this now in case it isn't already done.
1087 Setting this variable twice is harmless.
1088 But don't staticpro it here--that is done in alloc.c. */
d67b4f80 1089 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
c57f3328
KH
1090 DEFSYM (Qchar_script_table, "char-script-table");
1091 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1092 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
c7e14352 1093
29208e82 1094 DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
c3bb7671 1095 doc: /* Alist of scripts vs the representative characters.
f4427a54 1096Each element is a cons (SCRIPT . CHARS).
4d8e170e 1097SCRIPT is a symbol representing a script or a subgroup of a script.
c3bb7671 1098CHARS is a list or a vector of characters.
472a4dc9 1099If it is a list, all characters in the list are necessary for supporting SCRIPT.
c3bb7671
KH
1100If it is a vector, one of the characters in the vector is necessary.
1101This variable is used to find a font for a specific script. */);
c7e14352 1102 Vscript_representative_chars = Qnil;
a3cbb631 1103
29208e82 1104 DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
a3cbb631 1105 doc: /* Char table of Unicode's "General Category".
472a4dc9
JB
1106All Unicode characters have one of the following values (symbol):
1107 Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
a3cbb631
KH
1108 Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1109See The Unicode Standard for the meaning of those values. */);
1110 /* The correct char-table is setup in characters.el. */
1111 Vunicode_category_table = Qnil;
0168c3d8
KH
1112}
1113
1114#endif /* emacs */