Delete char-direction-table and char-direction.
[bpt/emacs.git] / src / character.c
CommitLineData
0168c3d8 1/* Basic character support.
73b0cd50
GM
2
3Copyright (C) 2001-2011 Free Software Foundation, Inc.
4Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
5 Licensed to the Free Software Foundation.
6Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
7 National Institute of Advanced Industrial Science and Technology (AIST)
8 Registration Number H13PRO009
0168c3d8
KH
9
10This file is part of GNU Emacs.
11
9ec0b715 12GNU Emacs is free software: you can redistribute it and/or modify
0168c3d8 13it under the terms of the GNU General Public License as published by
9ec0b715
GM
14the Free Software Foundation, either version 3 of the License, or
15(at your option) any later version.
0168c3d8
KH
16
17GNU Emacs is distributed in the hope that it will be useful,
18but WITHOUT ANY WARRANTY; without even the implied warranty of
19MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20GNU General Public License for more details.
21
22You should have received a copy of the GNU General Public License
9ec0b715 23along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
0168c3d8
KH
24
25/* At first, see the document in `character.h' to understand the code
26 in this file. */
27
28#ifdef emacs
29#include <config.h>
30#endif
31
32#include <stdio.h>
33
34#ifdef emacs
35
36#include <sys/types.h>
d7306fe6 37#include <setjmp.h>
0168c3d8
KH
38#include "lisp.h"
39#include "character.h"
40#include "buffer.h"
41#include "charset.h"
42#include "composite.h"
43#include "disptab.h"
44
45#else /* not emacs */
46
47#include "mulelib.h"
48
49#endif /* emacs */
50
51Lisp_Object Qcharacterp;
52
955cbe7b 53static Lisp_Object Qauto_fill_chars;
0168c3d8 54
33f91981 55/* Char-table of information about which character to unify to which
6f1b43a0 56 Unicode character. Mainly used by the macro MAYBE_UNIFY_CHAR. */
0168c3d8
KH
57Lisp_Object Vchar_unify_table;
58
8973478b 59/* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
0168c3d8 60unsigned char *_fetch_multibyte_char_p;
0168c3d8 61
c57f3328
KH
62static Lisp_Object Qchar_script_table;
63
0168c3d8
KH
64\f
65
2bde7652
KH
66/* If character code C has modifier masks, reflect them to the
67 character code if possible. Return the resulting code. */
68
69int
971de7fb 70char_resolve_modifier_mask (int c)
2bde7652 71{
d0363d44 72 /* A non-ASCII character can't reflect modifier bits to the code. */
2bde7652
KH
73 if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
74 return c;
75
76 /* For Meta, Shift, and Control modifiers, we need special care. */
2bde7652
KH
77 if (c & CHAR_SHIFT)
78 {
79 /* Shift modifier is valid only with [A-Za-z]. */
80 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
81 c &= ~CHAR_SHIFT;
82 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
83 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
03365d0e
KH
84 /* Shift modifier for control characters and SPC is ignored. */
85 else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
d0363d44
KH
86 c &= ~CHAR_SHIFT;
87 }
2bde7652
KH
88 if (c & CHAR_CTL)
89 {
90 /* Simulate the code in lread.c. */
91 /* Allow `\C- ' and `\C-?'. */
03365d0e
KH
92 if ((c & 0377) == ' ')
93 c &= ~0177 & ~ CHAR_CTL;
94 else if ((c & 0377) == '?')
95 c = 0177 | (c & ~0177 & ~CHAR_CTL);
2bde7652
KH
96 /* ASCII control chars are made from letters (both cases),
97 as well as the non-letters within 0100...0137. */
98 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
99 c &= (037 | (~0177 & ~CHAR_CTL));
100 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
101 c &= (037 | (~0177 & ~CHAR_CTL));
102 }
e9c1637d 103#if 0 /* This is outside the scope of this function. (bug#4751) */
03365d0e
KH
104 if (c & CHAR_META)
105 {
106 /* Move the meta bit to the right place for a string. */
107 c = (c & ~CHAR_META) | 0x80;
108 }
e9c1637d 109#endif
2bde7652
KH
110
111 return c;
112}
113
114
33f91981
KH
115/* Store multibyte form of character C at P. If C has modifier bits,
116 handle them appropriately. */
117
0168c3d8 118int
971de7fb 119char_string (unsigned int c, unsigned char *p)
0168c3d8
KH
120{
121 int bytes;
122
e3d8eb8c
KH
123 if (c & CHAR_MODIFIER_MASK)
124 {
2bde7652 125 c = (unsigned) char_resolve_modifier_mask ((int) c);
e3d8eb8c
KH
126 /* If C still has any modifier bits, just ignore it. */
127 c &= ~CHAR_MODIFIER_MASK;
128 }
129
0168c3d8
KH
130 MAYBE_UNIFY_CHAR (c);
131
e3d8eb8c 132 if (c <= MAX_3_BYTE_CHAR)
0168c3d8
KH
133 {
134 bytes = CHAR_STRING (c, p);
135 }
136 else if (c <= MAX_4_BYTE_CHAR)
137 {
138 p[0] = (0xF0 | (c >> 18));
139 p[1] = (0x80 | ((c >> 12) & 0x3F));
140 p[2] = (0x80 | ((c >> 6) & 0x3F));
141 p[3] = (0x80 | (c & 0x3F));
142 bytes = 4;
143 }
e3d8eb8c 144 else if (c <= MAX_5_BYTE_CHAR)
0168c3d8
KH
145 {
146 p[0] = 0xF8;
147 p[1] = (0x80 | ((c >> 18) & 0x0F));
148 p[2] = (0x80 | ((c >> 12) & 0x3F));
149 p[3] = (0x80 | ((c >> 6) & 0x3F));
150 p[4] = (0x80 | (c & 0x3F));
151 bytes = 5;
152 }
5aa91c9b 153 else if (c <= MAX_CHAR)
e3d8eb8c
KH
154 {
155 c = CHAR_TO_BYTE8 (c);
156 bytes = BYTE8_STRING (c, p);
157 }
5aa91c9b 158 else
e6c3da20 159 error ("Invalid character: %x", c);
1889b238 160
0168c3d8
KH
161 return bytes;
162}
163
164
224a3131 165/* Return a character whose multibyte form is at P. If LEN is not
33f91981 166 NULL, it must be a pointer to integer. In that case, set *LEN to
224a3131 167 the byte length of the multibyte form. If ADVANCED is not NULL, it
33f91981 168 must be a pointer to unsigned char. In that case, set *ADVANCED to
224a3131 169 the ending address (i.e., the starting address of the next
33f91981
KH
170 character) of the multibyte form. */
171
0168c3d8 172int
971de7fb 173string_char (const unsigned char *p, const unsigned char **advanced, int *len)
0168c3d8 174{
1889b238 175 int c;
15843e6f 176 const unsigned char *saved_p = p;
0168c3d8
KH
177
178 if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
179 {
180 c = STRING_CHAR_ADVANCE (p);
181 }
182 else if (! (*p & 0x08))
183 {
184 c = ((((p)[0] & 0xF) << 18)
185 | (((p)[1] & 0x3F) << 12)
186 | (((p)[2] & 0x3F) << 6)
187 | ((p)[3] & 0x3F));
188 p += 4;
189 }
190 else
191 {
192 c = ((((p)[1] & 0x3F) << 18)
193 | (((p)[2] & 0x3F) << 12)
194 | (((p)[3] & 0x3F) << 6)
195 | ((p)[4] & 0x3F));
196 p += 5;
197 }
198
199 MAYBE_UNIFY_CHAR (c);
200
201 if (len)
202 *len = p - saved_p;
203 if (advanced)
204 *advanced = p;
205 return c;
206}
207
208
224a3131
EZ
209/* Translate character C by translation table TABLE. If no translation is
210 found in TABLE, return the untranslated character. If TABLE is a list,
211 elements are char tables. In that case, recursively translate C by all the
212 tables in the list. */
0168c3d8
KH
213
214int
971de7fb 215translate_char (Lisp_Object table, int c)
0168c3d8 216{
10453be9
KH
217 if (CHAR_TABLE_P (table))
218 {
219 Lisp_Object ch;
220
221 ch = CHAR_TABLE_REF (table, c);
222 if (CHARACTERP (ch))
223 c = XINT (ch);
224 }
225 else
226 {
227 for (; CONSP (table); table = XCDR (table))
228 c = translate_char (XCAR (table), c);
229 }
230 return c;
0168c3d8
KH
231}
232
2e5db15c 233/* Convert ASCII or 8-bit character C to unibyte. If C is none of
461c2ab9 234 them, return (C & 0xFF). */
0168c3d8
KH
235
236int
461c2ab9 237multibyte_char_to_unibyte (int c)
0168c3d8 238{
2e5db15c
KH
239 if (c < 0x80)
240 return c;
b672c5ae
KH
241 if (CHAR_BYTE8_P (c))
242 return CHAR_TO_BYTE8 (c);
2e5db15c 243 return (c & 0xFF);
0168c3d8
KH
244}
245
935d5b02
KH
246/* Like multibyte_char_to_unibyte, but return -1 if C is not supported
247 by charset_unibyte. */
248
249int
971de7fb 250multibyte_char_to_unibyte_safe (int c)
935d5b02 251{
2e5db15c
KH
252 if (c < 0x80)
253 return c;
935d5b02
KH
254 if (CHAR_BYTE8_P (c))
255 return CHAR_TO_BYTE8 (c);
2e5db15c 256 return -1;
935d5b02 257}
0168c3d8
KH
258
259DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
260 doc: /* Return non-nil if OBJECT is a character. */)
5842a27b 261 (Lisp_Object object, Lisp_Object ignore)
0168c3d8
KH
262{
263 return (CHARACTERP (object) ? Qt : Qnil);
264}
265
266DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
267 doc: /* Return the character of the maximum code. */)
5842a27b 268 (void)
0168c3d8
KH
269{
270 return make_number (MAX_CHAR);
271}
272
273DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
274 Sunibyte_char_to_multibyte, 1, 1, 0,
5556875b 275 doc: /* Convert the byte CH to multibyte character. */)
5842a27b 276 (Lisp_Object ch)
0168c3d8
KH
277{
278 int c;
0168c3d8
KH
279
280 CHECK_CHARACTER (ch);
281 c = XFASTINT (ch);
2e5db15c
KH
282 if (c >= 0x100)
283 error ("Not a unibyte character: %d", c);
4c0354d7 284 MAKE_CHAR_MULTIBYTE (c);
0168c3d8
KH
285 return make_number (c);
286}
287
288DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
289 Smultibyte_char_to_unibyte, 1, 1, 0,
5556875b
SM
290 doc: /* Convert the multibyte character CH to a byte.
291If the multibyte character does not represent a byte, return -1. */)
5842a27b 292 (Lisp_Object ch)
0168c3d8 293{
5556875b 294 int cm;
0168c3d8
KH
295
296 CHECK_CHARACTER (ch);
5556875b
SM
297 cm = XFASTINT (ch);
298 if (cm < 256)
299 /* Can't distinguish a byte read from a unibyte buffer from
300 a latin1 char, so let's let it slide. */
301 return ch;
302 else
303 {
2afc21f5 304 int cu = CHAR_TO_BYTE_SAFE (cm);
5556875b
SM
305 return make_number (cu);
306 }
0168c3d8
KH
307}
308
a7ca3326 309DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
0168c3d8
KH
310 doc: /* Return width of CHAR when displayed in the current buffer.
311The width is measured by how many columns it occupies on the screen.
add553ac
JB
312Tab is taken to occupy `tab-width' columns.
313usage: (char-width CHAR) */)
5842a27b 314 (Lisp_Object ch)
0168c3d8
KH
315{
316 Lisp_Object disp;
317 int c, width;
318 struct Lisp_Char_Table *dp = buffer_display_table ();
319
320 CHECK_CHARACTER (ch);
321 c = XINT (ch);
322
323 /* Get the way the display table would display it. */
324 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
325
326 if (VECTORP (disp))
327 width = ASIZE (disp);
328 else
329 width = CHAR_WIDTH (c);
330
331 return make_number (width);
332}
333
0168c3d8
KH
334/* Return width of string STR of length LEN when displayed in the
335 current buffer. The width is measured by how many columns it
336 occupies on the screen. If PRECISION > 0, return the width of
337 longest substring that doesn't exceed PRECISION, and set number of
338 characters and bytes of the substring in *NCHARS and *NBYTES
339 respectively. */
340
579c18d0
EZ
341EMACS_INT
342c_string_width (const unsigned char *str, EMACS_INT len, int precision,
343 EMACS_INT *nchars, EMACS_INT *nbytes)
0168c3d8 344{
579c18d0
EZ
345 EMACS_INT i = 0, i_byte = 0;
346 EMACS_INT width = 0;
0168c3d8
KH
347 struct Lisp_Char_Table *dp = buffer_display_table ();
348
349 while (i_byte < len)
350 {
351 int bytes, thiswidth;
352 Lisp_Object val;
62a6e103 353 int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
0168c3d8
KH
354
355 if (dp)
356 {
357 val = DISP_CHAR_VECTOR (dp, c);
358 if (VECTORP (val))
359 thiswidth = XVECTOR (val)->size;
360 else
361 thiswidth = CHAR_WIDTH (c);
362 }
363 else
364 {
365 thiswidth = CHAR_WIDTH (c);
366 }
367
368 if (precision > 0
369 && (width + thiswidth > precision))
370 {
371 *nchars = i;
372 *nbytes = i_byte;
373 return width;
374 }
375 i++;
376 i_byte += bytes;
377 width += thiswidth;
378 }
379
380 if (precision > 0)
381 {
382 *nchars = i;
383 *nbytes = i_byte;
384 }
385
386 return width;
387}
388
1889b238
KH
389/* Return width of string STR of length LEN when displayed in the
390 current buffer. The width is measured by how many columns it
391 occupies on the screen. */
392
579c18d0 393EMACS_INT
7469ef5d 394strwidth (const char *str, EMACS_INT len)
1889b238 395{
7469ef5d 396 return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
1889b238
KH
397}
398
0168c3d8
KH
399/* Return width of Lisp string STRING when displayed in the current
400 buffer. The width is measured by how many columns it occupies on
401 the screen while paying attention to compositions. If PRECISION >
402 0, return the width of longest substring that doesn't exceed
403 PRECISION, and set number of characters and bytes of the substring
404 in *NCHARS and *NBYTES respectively. */
405
579c18d0
EZ
406EMACS_INT
407lisp_string_width (Lisp_Object string, int precision,
408 EMACS_INT *nchars, EMACS_INT *nbytes)
0168c3d8 409{
579c18d0 410 EMACS_INT len = SCHARS (string);
0aee65b9
KH
411 /* This set multibyte to 0 even if STRING is multibyte when it
412 contains only ascii and eight-bit-graphic, but that's
413 intentional. */
414 int multibyte = len < SBYTES (string);
8f924df7 415 unsigned char *str = SDATA (string);
579c18d0
EZ
416 EMACS_INT i = 0, i_byte = 0;
417 EMACS_INT width = 0;
0168c3d8
KH
418 struct Lisp_Char_Table *dp = buffer_display_table ();
419
420 while (i < len)
421 {
422 int chars, bytes, thiswidth;
423 Lisp_Object val;
424 int cmp_id;
f4bc0685 425 EMACS_INT ignore, end;
0168c3d8
KH
426
427 if (find_composition (i, -1, &ignore, &end, &val, string)
428 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
429 >= 0))
430 {
431 thiswidth = composition_table[cmp_id]->width;
432 chars = end - i;
433 bytes = string_char_to_byte (string, end) - i_byte;
434 }
0168c3d8
KH
435 else
436 {
0aee65b9 437 int c;
0168c3d8 438
0aee65b9 439 if (multibyte)
62a6e103 440 c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
0aee65b9
KH
441 else
442 c = str[i_byte], bytes = 1;
0168c3d8 443 chars = 1;
0aee65b9
KH
444 if (dp)
445 {
446 val = DISP_CHAR_VECTOR (dp, c);
447 if (VECTORP (val))
448 thiswidth = XVECTOR (val)->size;
449 else
450 thiswidth = CHAR_WIDTH (c);
451 }
452 else
453 {
454 thiswidth = CHAR_WIDTH (c);
455 }
0168c3d8
KH
456 }
457
458 if (precision > 0
459 && (width + thiswidth > precision))
460 {
461 *nchars = i;
462 *nbytes = i_byte;
463 return width;
464 }
465 i += chars;
466 i_byte += bytes;
467 width += thiswidth;
468 }
469
470 if (precision > 0)
471 {
472 *nchars = i;
473 *nbytes = i_byte;
474 }
475
476 return width;
477}
478
479DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
480 doc: /* Return width of STRING when displayed in the current buffer.
481Width is measured by how many columns it occupies on the screen.
482When calculating width of a multibyte character in STRING,
483only the base leading-code is considered; the validity of
484the following bytes is not checked. Tabs in STRING are always
add553ac
JB
485taken to occupy `tab-width' columns.
486usage: (string-width STRING) */)
5842a27b 487 (Lisp_Object str)
0168c3d8
KH
488{
489 Lisp_Object val;
490
491 CHECK_STRING (str);
492 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
493 return val;
494}
495
0168c3d8
KH
496/* Return the number of characters in the NBYTES bytes at PTR.
497 This works by looking at the contents and checking for multibyte
498 sequences while assuming that there's no invalid sequence.
499 However, if the current buffer has enable-multibyte-characters =
500 nil, we treat each byte as a character. */
501
13818c30 502EMACS_INT
971de7fb 503chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
0168c3d8
KH
504{
505 /* current_buffer is null at early stages of Emacs initialization. */
506 if (current_buffer == 0
4b4deea2 507 || NILP (BVAR (current_buffer, enable_multibyte_characters)))
0168c3d8
KH
508 return nbytes;
509
510 return multibyte_chars_in_text (ptr, nbytes);
511}
512
513/* Return the number of characters in the NBYTES bytes at PTR.
514 This works by looking at the contents and checking for multibyte
515 sequences while assuming that there's no invalid sequence. It
516 ignores enable-multibyte-characters. */
517
13818c30 518EMACS_INT
971de7fb 519multibyte_chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
0168c3d8 520{
8f924df7 521 const unsigned char *endp = ptr + nbytes;
579c18d0 522 EMACS_INT chars = 0;
0168c3d8
KH
523
524 while (ptr < endp)
525 {
579c18d0 526 EMACS_INT len = MULTIBYTE_LENGTH (ptr, endp);
0168c3d8
KH
527
528 if (len == 0)
529 abort ();
530 ptr += len;
531 chars++;
532 }
533
534 return chars;
535}
536
537/* Parse unibyte text at STR of LEN bytes as a multibyte text, count
538 characters and bytes in it, and store them in *NCHARS and *NBYTES
539 respectively. On counting bytes, pay attention to that 8-bit
540 characters not constructing a valid multibyte sequence are
541 represented by 2-byte in a multibyte text. */
542
543void
14162469
EZ
544parse_str_as_multibyte (const unsigned char *str, EMACS_INT len,
545 EMACS_INT *nchars, EMACS_INT *nbytes)
0168c3d8 546{
8f924df7 547 const unsigned char *endp = str + len;
14162469 548 EMACS_INT n, chars = 0, bytes = 0;
0168c3d8
KH
549
550 if (len >= MAX_MULTIBYTE_LENGTH)
551 {
8f924df7 552 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
0168c3d8
KH
553 while (str < adjusted_endp)
554 {
fc9a17bc
KH
555 if (! CHAR_BYTE8_HEAD_P (*str)
556 && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
0168c3d8
KH
557 str += n, bytes += n;
558 else
559 str++, bytes += 2;
560 chars++;
561 }
562 }
563 while (str < endp)
564 {
fc9a17bc
KH
565 if (! CHAR_BYTE8_HEAD_P (*str)
566 && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
0168c3d8
KH
567 str += n, bytes += n;
568 else
569 str++, bytes += 2;
570 chars++;
571 }
572
573 *nchars = chars;
574 *nbytes = bytes;
575 return;
576}
577
578/* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
579 It actually converts only such 8-bit characters that don't contruct
580 a multibyte sequence to multibyte forms of Latin-1 characters. If
581 NCHARS is nonzero, set *NCHARS to the number of characters in the
582 text. It is assured that we can use LEN bytes at STR as a work
583 area and that is enough. Return the number of bytes of the
584 resulting text. */
585
14162469
EZ
586EMACS_INT
587str_as_multibyte (unsigned char *str, EMACS_INT len, EMACS_INT nbytes,
588 EMACS_INT *nchars)
0168c3d8
KH
589{
590 unsigned char *p = str, *endp = str + nbytes;
591 unsigned char *to;
14162469 592 EMACS_INT chars = 0;
0168c3d8
KH
593 int n;
594
595 if (nbytes >= MAX_MULTIBYTE_LENGTH)
596 {
597 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
598 while (p < adjusted_endp
fc9a17bc 599 && ! CHAR_BYTE8_HEAD_P (*p)
0168c3d8
KH
600 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
601 p += n, chars++;
602 }
fc9a17bc
KH
603 while (p < endp
604 && ! CHAR_BYTE8_HEAD_P (*p)
605 && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
0168c3d8
KH
606 p += n, chars++;
607 if (nchars)
608 *nchars = chars;
609 if (p == endp)
610 return nbytes;
611
612 to = p;
613 nbytes = endp - p;
614 endp = str + len;
72af86bd 615 memmove (endp - nbytes, p, nbytes);
0168c3d8
KH
616 p = endp - nbytes;
617
618 if (nbytes >= MAX_MULTIBYTE_LENGTH)
619 {
620 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
621 while (p < adjusted_endp)
622 {
fc9a17bc
KH
623 if (! CHAR_BYTE8_HEAD_P (*p)
624 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
0168c3d8
KH
625 {
626 while (n--)
627 *to++ = *p++;
628 }
629 else
630 {
631 int c = *p++;
632 c = BYTE8_TO_CHAR (c);
633 to += CHAR_STRING (c, to);
634 }
635 }
636 chars++;
637 }
638 while (p < endp)
639 {
fc9a17bc
KH
640 if (! CHAR_BYTE8_HEAD_P (*p)
641 && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
0168c3d8
KH
642 {
643 while (n--)
644 *to++ = *p++;
8f924df7 645 }
0168c3d8
KH
646 else
647 {
648 int c = *p++;
649 c = BYTE8_TO_CHAR (c);
650 to += CHAR_STRING (c, to);
651 }
652 chars++;
653 }
654 if (nchars)
655 *nchars = chars;
656 return (to - str);
657}
658
659/* Parse unibyte string at STR of LEN bytes, and return the number of
660 bytes it may ocupy when converted to multibyte string by
661 `str_to_multibyte'. */
662
14162469
EZ
663EMACS_INT
664parse_str_to_multibyte (const unsigned char *str, EMACS_INT len)
0168c3d8 665{
eec47d6b 666 const unsigned char *endp = str + len;
14162469 667 EMACS_INT bytes;
0168c3d8
KH
668
669 for (bytes = 0; str < endp; str++)
670 bytes += (*str < 0x80) ? 1 : 2;
671 return bytes;
672}
673
674
675/* Convert unibyte text at STR of NBYTES bytes to a multibyte text
676 that contains the same single-byte characters. It actually
677 converts all 8-bit characters to multibyte forms. It is assured
678 that we can use LEN bytes at STR as a work area and that is
679 enough. */
680
14162469
EZ
681EMACS_INT
682str_to_multibyte (unsigned char *str, EMACS_INT len, EMACS_INT bytes)
0168c3d8
KH
683{
684 unsigned char *p = str, *endp = str + bytes;
685 unsigned char *to;
686
687 while (p < endp && *p < 0x80) p++;
688 if (p == endp)
689 return bytes;
690 to = p;
691 bytes = endp - p;
692 endp = str + len;
72af86bd 693 memmove (endp - bytes, p, bytes);
0168c3d8 694 p = endp - bytes;
8f924df7 695 while (p < endp)
0168c3d8
KH
696 {
697 int c = *p++;
698
699 if (c >= 0x80)
700 c = BYTE8_TO_CHAR (c);
701 to += CHAR_STRING (c, to);
702 }
703 return (to - str);
704}
705
706/* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
707 actually converts characters in the range 0x80..0xFF to
708 unibyte. */
709
14162469
EZ
710EMACS_INT
711str_as_unibyte (unsigned char *str, EMACS_INT bytes)
0168c3d8 712{
15843e6f
KH
713 const unsigned char *p = str, *endp = str + bytes;
714 unsigned char *to;
0168c3d8
KH
715 int c, len;
716
717 while (p < endp)
718 {
719 c = *p;
720 len = BYTES_BY_CHAR_HEAD (c);
721 if (CHAR_BYTE8_HEAD_P (c))
722 break;
723 p += len;
724 }
15843e6f 725 to = str + (p - str);
8f924df7 726 while (p < endp)
0168c3d8
KH
727 {
728 c = *p;
729 len = BYTES_BY_CHAR_HEAD (c);
730 if (CHAR_BYTE8_HEAD_P (c))
731 {
732 c = STRING_CHAR_ADVANCE (p);
733 *to++ = CHAR_TO_BYTE8 (c);
734 }
735 else
736 {
737 while (len--) *to++ = *p++;
738 }
739 }
740 return (to - str);
741}
742
4aa40bb8
KH
743/* Convert eight-bit chars in SRC (in multibyte form) to the
744 corresponding byte and store in DST. CHARS is the number of
745 characters in SRC. The value is the number of bytes stored in DST.
746 Usually, the value is the same as CHARS, but is less than it if SRC
8307f923 747 contains a non-ASCII, non-eight-bit character. If ACCEPT_LATIN_1
4aa40bb8 748 is nonzero, a Latin-1 character is accepted and converted to a byte
f27f70ec
KH
749 of that character code.
750 Note: Currently the arg ACCEPT_LATIN_1 is not used. */
4aa40bb8
KH
751
752EMACS_INT
971de7fb 753str_to_unibyte (const unsigned char *src, unsigned char *dst, EMACS_INT chars, int accept_latin_1)
4aa40bb8
KH
754{
755 EMACS_INT i;
756
757 for (i = 0; i < chars; i++)
758 {
759 int c = STRING_CHAR_ADVANCE (src);
760
761 if (CHAR_BYTE8_P (c))
762 c = CHAR_TO_BYTE8 (c);
763 else if (! ASCII_CHAR_P (c)
764 && (! accept_latin_1 || c >= 0x100))
765 return i;
766 *dst++ = c;
767 }
768 return i;
769}
770
771
d0891610 772static EMACS_INT
971de7fb 773string_count_byte8 (Lisp_Object string)
0168c3d8
KH
774{
775 int multibyte = STRING_MULTIBYTE (string);
14162469 776 EMACS_INT nbytes = SBYTES (string);
8f924df7 777 unsigned char *p = SDATA (string);
0168c3d8 778 unsigned char *pend = p + nbytes;
14162469 779 EMACS_INT count = 0;
0168c3d8
KH
780 int c, len;
781
782 if (multibyte)
783 while (p < pend)
784 {
785 c = *p;
786 len = BYTES_BY_CHAR_HEAD (c);
787
788 if (CHAR_BYTE8_HEAD_P (c))
789 count++;
790 p += len;
791 }
792 else
793 while (p < pend)
794 {
795 if (*p++ >= 0x80)
796 count++;
797 }
798 return count;
799}
800
801
802Lisp_Object
971de7fb 803string_escape_byte8 (Lisp_Object string)
0168c3d8 804{
14162469
EZ
805 EMACS_INT nchars = SCHARS (string);
806 EMACS_INT nbytes = SBYTES (string);
0168c3d8 807 int multibyte = STRING_MULTIBYTE (string);
14162469 808 EMACS_INT byte8_count;
15843e6f
KH
809 const unsigned char *src, *src_end;
810 unsigned char *dst;
0168c3d8
KH
811 Lisp_Object val;
812 int c, len;
813
814 if (multibyte && nchars == nbytes)
815 return string;
816
817 byte8_count = string_count_byte8 (string);
818
819 if (byte8_count == 0)
820 return string;
821
822 if (multibyte)
14162469
EZ
823 {
824 if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
825 || (MOST_POSITIVE_FIXNUM - nbytes) / 2 < byte8_count)
826 error ("Maximum string size exceeded");
827
828 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
829 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
830 nbytes + byte8_count * 2);
831 }
0168c3d8 832 else
14162469
EZ
833 {
834 if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count)
835 error ("Maximum string size exceeded");
836 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
837 val = make_uninit_string (nbytes + byte8_count * 3);
838 }
0168c3d8 839
8f924df7 840 src = SDATA (string);
0168c3d8 841 src_end = src + nbytes;
8f924df7 842 dst = SDATA (val);
0168c3d8
KH
843 if (multibyte)
844 while (src < src_end)
845 {
846 c = *src;
847 len = BYTES_BY_CHAR_HEAD (c);
848
849 if (CHAR_BYTE8_HEAD_P (c))
850 {
851 c = STRING_CHAR_ADVANCE (src);
852 c = CHAR_TO_BYTE8 (c);
1889b238 853 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
854 dst += 4;
855 }
856 else
857 while (len--) *dst++ = *src++;
858 }
859 else
860 while (src < src_end)
861 {
862 c = *src++;
863 if (c >= 0x80)
864 {
1889b238 865 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
866 dst += 4;
867 }
868 else
869 *dst++ = c;
870 }
871 return val;
872}
873
874\f
a7ca3326 875DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
0168c3d8 876 doc: /*
d2e83296
DL
877Concatenate all the argument characters and make the result a string.
878usage: (string &rest CHARACTERS) */)
c5101a77 879 (size_t n, Lisp_Object *args)
0168c3d8 880{
c5101a77
PE
881 size_t i;
882 int c;
754790b6
CY
883 unsigned char *buf, *p;
884 Lisp_Object str;
885 USE_SAFE_ALLOCA;
886
887 SAFE_ALLOCA (buf, unsigned char *, MAX_MULTIBYTE_LENGTH * n);
888 p = buf;
0168c3d8
KH
889
890 for (i = 0; i < n; i++)
891 {
892 CHECK_CHARACTER (args[i]);
893 c = XINT (args[i]);
894 p += CHAR_STRING (c, p);
895 }
896
754790b6
CY
897 str = make_string_from_bytes ((char *) buf, n, p - buf);
898 SAFE_FREE ();
899 return str;
0168c3d8
KH
900}
901
70b4969d 902DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
87d6f965
KH
903 doc: /* Concatenate all the argument bytes and make the result a unibyte string.
904usage: (unibyte-string &rest BYTES) */)
c5101a77 905 (size_t n, Lisp_Object *args)
70b4969d 906{
c5101a77
PE
907 size_t i;
908 int c;
754790b6
CY
909 unsigned char *buf, *p;
910 Lisp_Object str;
911 USE_SAFE_ALLOCA;
912
913 SAFE_ALLOCA (buf, unsigned char *, n);
914 p = buf;
70b4969d
KH
915
916 for (i = 0; i < n; i++)
917 {
918 CHECK_NATNUM (args[i]);
919 c = XFASTINT (args[i]);
920 if (c >= 256)
921 args_out_of_range_3 (args[i], make_number (0), make_number (255));
922 *p++ = c;
923 }
924
754790b6
CY
925 str = make_string_from_bytes ((char *) buf, n, p - buf);
926 SAFE_FREE ();
927 return str;
70b4969d
KH
928}
929
c73ae4ae 930DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
d0363d44
KH
931 Schar_resolve_modifiers, 1, 1, 0,
932 doc: /* Resolve modifiers in the character CHAR.
933The value is a character with modifiers resolved into the character
934code. Unresolved modifiers are kept in the value.
c73ae4ae 935usage: (char-resolve-modifiers CHAR) */)
5842a27b 936 (Lisp_Object character)
d0363d44
KH
937{
938 int c;
939
940 CHECK_NUMBER (character);
941 c = XINT (character);
942 return make_number (char_resolve_modifier_mask (c));
943}
944
ee107a89
KH
945DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
946 doc: /* Return a byte value of a character at point.
947Optional 1st arg POSITION, if non-nil, is a position of a character to get
948a byte value.
949Optional 2nd arg STRING, if non-nil, is a string of which first
950character is a target to get a byte value. In this case, POSITION, if
951non-nil, is an index of a target character in the string.
952
953If the current buffer (or STRING) is multibyte, and the target
954character is not ASCII nor 8-bit character, an error is signalled. */)
5842a27b 955 (Lisp_Object position, Lisp_Object string)
ee107a89
KH
956{
957 int c;
958 EMACS_INT pos;
959 unsigned char *p;
960
961 if (NILP (string))
962 {
963 if (NILP (position))
964 {
965 p = PT_ADDR;
4d8e170e 966 }
ee107a89
KH
967 else
968 {
969 CHECK_NUMBER_COERCE_MARKER (position);
970 if (XINT (position) < BEGV || XINT (position) >= ZV)
971 args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
972 pos = XFASTINT (position);
973 p = CHAR_POS_ADDR (pos);
974 }
4b4deea2 975 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
d5998e03 976 return make_number (*p);
ee107a89
KH
977 }
978 else
979 {
980 CHECK_STRING (string);
981 if (NILP (position))
982 {
983 p = SDATA (string);
984 }
985 else
986 {
987 CHECK_NATNUM (position);
988 if (XINT (position) >= SCHARS (string))
989 args_out_of_range (string, position);
990 pos = XFASTINT (position);
991 p = SDATA (string) + string_char_to_byte (string, pos);
992 }
d5998e03
KH
993 if (! STRING_MULTIBYTE (string))
994 return make_number (*p);
ee107a89 995 }
62a6e103 996 c = STRING_CHAR (p);
ee107a89
KH
997 if (CHAR_BYTE8_P (c))
998 c = CHAR_TO_BYTE8 (c);
999 else if (! ASCII_CHAR_P (c))
1000 error ("Not an ASCII nor an 8-bit character: %d", c);
1001 return make_number (c);
1002}
1003
1004
0168c3d8 1005void
971de7fb 1006init_character_once (void)
0168c3d8
KH
1007{
1008}
1009
1010#ifdef emacs
1011
1012void
971de7fb 1013syms_of_character (void)
0168c3d8
KH
1014{
1015 DEFSYM (Qcharacterp, "characterp");
1016 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1017
1018 staticpro (&Vchar_unify_table);
1019 Vchar_unify_table = Qnil;
1020
1021 defsubr (&Smax_char);
1022 defsubr (&Scharacterp);
1023 defsubr (&Sunibyte_char_to_multibyte);
1024 defsubr (&Smultibyte_char_to_unibyte);
0168c3d8
KH
1025 defsubr (&Schar_width);
1026 defsubr (&Sstring_width);
0168c3d8 1027 defsubr (&Sstring);
70b4969d 1028 defsubr (&Sunibyte_string);
d0363d44 1029 defsubr (&Schar_resolve_modifiers);
ee107a89 1030 defsubr (&Sget_byte);
0168c3d8 1031
29208e82 1032 DEFVAR_LISP ("translation-table-vector", Vtranslation_table_vector,
0168c3d8 1033 doc: /*
68978cf0
DL
1034Vector recording all translation tables ever defined.
1035Each element is a pair (SYMBOL . TABLE) relating the table to the
1036symbol naming it. The ID of a translation table is an index into this vector. */);
0168c3d8
KH
1037 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1038
29208e82 1039 DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
0168c3d8
KH
1040 doc: /*
1041A char-table for characters which invoke auto-filling.
1042Such characters have value t in this table. */);
1043 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
6cc0e1ca
DL
1044 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1045 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
0168c3d8 1046
29208e82 1047 DEFVAR_LISP ("char-width-table", Vchar_width_table,
0168c3d8
KH
1048 doc: /*
1049A char-table for width (columns) of each character. */);
1050 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
be8b50bc
KH
1051 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1052 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1053 make_number (4));
0168c3d8 1054
29208e82 1055 DEFVAR_LISP ("printable-chars", Vprintable_chars,
0168c3d8 1056 doc: /* A char-table for each printable character. */);
db6d4189 1057 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
67dde660
KH
1058 Fset_char_table_range (Vprintable_chars,
1059 Fcons (make_number (32), make_number (126)), Qt);
1060 Fset_char_table_range (Vprintable_chars,
1061 Fcons (make_number (160),
1062 make_number (MAX_5_BYTE_CHAR)), Qt);
15843e6f 1063
29208e82 1064 DEFVAR_LISP ("char-script-table", Vchar_script_table,
c57f3328
KH
1065 doc: /* Char table of script symbols.
1066It has one extra slot whose value is a list of script symbols. */);
1067
1068 /* Intern this now in case it isn't already done.
1069 Setting this variable twice is harmless.
1070 But don't staticpro it here--that is done in alloc.c. */
d67b4f80 1071 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
c57f3328
KH
1072 DEFSYM (Qchar_script_table, "char-script-table");
1073 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1074 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
c7e14352 1075
29208e82 1076 DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
c3bb7671 1077 doc: /* Alist of scripts vs the representative characters.
f4427a54 1078Each element is a cons (SCRIPT . CHARS).
4d8e170e 1079SCRIPT is a symbol representing a script or a subgroup of a script.
c3bb7671 1080CHARS is a list or a vector of characters.
472a4dc9 1081If it is a list, all characters in the list are necessary for supporting SCRIPT.
c3bb7671
KH
1082If it is a vector, one of the characters in the vector is necessary.
1083This variable is used to find a font for a specific script. */);
c7e14352 1084 Vscript_representative_chars = Qnil;
a3cbb631 1085
29208e82 1086 DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
a3cbb631 1087 doc: /* Char table of Unicode's "General Category".
472a4dc9
JB
1088All Unicode characters have one of the following values (symbol):
1089 Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
a3cbb631
KH
1090 Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1091See The Unicode Standard for the meaning of those values. */);
1092 /* The correct char-table is setup in characters.el. */
1093 Vunicode_category_table = Qnil;
0168c3d8
KH
1094}
1095
1096#endif /* emacs */