* callproc.c: conform to C89 pointer rules
[bpt/emacs.git] / src / character.c
CommitLineData
0168c3d8 1/* Basic character support.
73b0cd50
GM
2
3Copyright (C) 2001-2011 Free Software Foundation, Inc.
4Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
5 Licensed to the Free Software Foundation.
6Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
7 National Institute of Advanced Industrial Science and Technology (AIST)
8 Registration Number H13PRO009
0168c3d8
KH
9
10This file is part of GNU Emacs.
11
9ec0b715 12GNU Emacs is free software: you can redistribute it and/or modify
0168c3d8 13it under the terms of the GNU General Public License as published by
9ec0b715
GM
14the Free Software Foundation, either version 3 of the License, or
15(at your option) any later version.
0168c3d8
KH
16
17GNU Emacs is distributed in the hope that it will be useful,
18but WITHOUT ANY WARRANTY; without even the implied warranty of
19MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20GNU General Public License for more details.
21
22You should have received a copy of the GNU General Public License
9ec0b715 23along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
0168c3d8
KH
24
25/* At first, see the document in `character.h' to understand the code
26 in this file. */
27
28#ifdef emacs
29#include <config.h>
30#endif
31
32#include <stdio.h>
33
34#ifdef emacs
35
36#include <sys/types.h>
d7306fe6 37#include <setjmp.h>
0168c3d8
KH
38#include "lisp.h"
39#include "character.h"
40#include "buffer.h"
41#include "charset.h"
42#include "composite.h"
43#include "disptab.h"
44
45#else /* not emacs */
46
47#include "mulelib.h"
48
49#endif /* emacs */
50
51Lisp_Object Qcharacterp;
52
0168c3d8
KH
53Lisp_Object Qauto_fill_chars;
54
33f91981 55/* Char-table of information about which character to unify to which
6f1b43a0 56 Unicode character. Mainly used by the macro MAYBE_UNIFY_CHAR. */
0168c3d8
KH
57Lisp_Object Vchar_unify_table;
58
8973478b 59/* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
0168c3d8 60unsigned char *_fetch_multibyte_char_p;
0168c3d8 61
c57f3328
KH
62static Lisp_Object Qchar_script_table;
63
0168c3d8
KH
64\f
65
2bde7652
KH
66/* If character code C has modifier masks, reflect them to the
67 character code if possible. Return the resulting code. */
68
69int
971de7fb 70char_resolve_modifier_mask (int c)
2bde7652 71{
d0363d44 72 /* A non-ASCII character can't reflect modifier bits to the code. */
2bde7652
KH
73 if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
74 return c;
75
76 /* For Meta, Shift, and Control modifiers, we need special care. */
2bde7652
KH
77 if (c & CHAR_SHIFT)
78 {
79 /* Shift modifier is valid only with [A-Za-z]. */
80 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
81 c &= ~CHAR_SHIFT;
82 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
83 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
03365d0e
KH
84 /* Shift modifier for control characters and SPC is ignored. */
85 else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
d0363d44
KH
86 c &= ~CHAR_SHIFT;
87 }
2bde7652
KH
88 if (c & CHAR_CTL)
89 {
90 /* Simulate the code in lread.c. */
91 /* Allow `\C- ' and `\C-?'. */
03365d0e
KH
92 if ((c & 0377) == ' ')
93 c &= ~0177 & ~ CHAR_CTL;
94 else if ((c & 0377) == '?')
95 c = 0177 | (c & ~0177 & ~CHAR_CTL);
2bde7652
KH
96 /* ASCII control chars are made from letters (both cases),
97 as well as the non-letters within 0100...0137. */
98 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
99 c &= (037 | (~0177 & ~CHAR_CTL));
100 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
101 c &= (037 | (~0177 & ~CHAR_CTL));
102 }
e9c1637d 103#if 0 /* This is outside the scope of this function. (bug#4751) */
03365d0e
KH
104 if (c & CHAR_META)
105 {
106 /* Move the meta bit to the right place for a string. */
107 c = (c & ~CHAR_META) | 0x80;
108 }
e9c1637d 109#endif
2bde7652
KH
110
111 return c;
112}
113
114
33f91981
KH
115/* Store multibyte form of character C at P. If C has modifier bits,
116 handle them appropriately. */
117
0168c3d8 118int
971de7fb 119char_string (unsigned int c, unsigned char *p)
0168c3d8
KH
120{
121 int bytes;
122
e3d8eb8c
KH
123 if (c & CHAR_MODIFIER_MASK)
124 {
2bde7652 125 c = (unsigned) char_resolve_modifier_mask ((int) c);
e3d8eb8c
KH
126 /* If C still has any modifier bits, just ignore it. */
127 c &= ~CHAR_MODIFIER_MASK;
128 }
129
0168c3d8
KH
130 MAYBE_UNIFY_CHAR (c);
131
e3d8eb8c 132 if (c <= MAX_3_BYTE_CHAR)
0168c3d8
KH
133 {
134 bytes = CHAR_STRING (c, p);
135 }
136 else if (c <= MAX_4_BYTE_CHAR)
137 {
138 p[0] = (0xF0 | (c >> 18));
139 p[1] = (0x80 | ((c >> 12) & 0x3F));
140 p[2] = (0x80 | ((c >> 6) & 0x3F));
141 p[3] = (0x80 | (c & 0x3F));
142 bytes = 4;
143 }
e3d8eb8c 144 else if (c <= MAX_5_BYTE_CHAR)
0168c3d8
KH
145 {
146 p[0] = 0xF8;
147 p[1] = (0x80 | ((c >> 18) & 0x0F));
148 p[2] = (0x80 | ((c >> 12) & 0x3F));
149 p[3] = (0x80 | ((c >> 6) & 0x3F));
150 p[4] = (0x80 | (c & 0x3F));
151 bytes = 5;
152 }
5aa91c9b 153 else if (c <= MAX_CHAR)
e3d8eb8c
KH
154 {
155 c = CHAR_TO_BYTE8 (c);
156 bytes = BYTE8_STRING (c, p);
157 }
5aa91c9b
KH
158 else
159 error ("Invalid character: %d", c);
1889b238 160
0168c3d8
KH
161 return bytes;
162}
163
164
33f91981
KH
165/* Return a character whose multibyte form is at P. Set LEN is not
166 NULL, it must be a pointer to integer. In that case, set *LEN to
167 the byte length of the multibyte form. If ADVANCED is not NULL, is
168 must be a pointer to unsigned char. In that case, set *ADVANCED to
169 the ending address (i.e. the starting address of the next
170 character) of the multibyte form. */
171
0168c3d8 172int
971de7fb 173string_char (const unsigned char *p, const unsigned char **advanced, int *len)
0168c3d8 174{
1889b238 175 int c;
15843e6f 176 const unsigned char *saved_p = p;
0168c3d8
KH
177
178 if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
179 {
180 c = STRING_CHAR_ADVANCE (p);
181 }
182 else if (! (*p & 0x08))
183 {
184 c = ((((p)[0] & 0xF) << 18)
185 | (((p)[1] & 0x3F) << 12)
186 | (((p)[2] & 0x3F) << 6)
187 | ((p)[3] & 0x3F));
188 p += 4;
189 }
190 else
191 {
192 c = ((((p)[1] & 0x3F) << 18)
193 | (((p)[2] & 0x3F) << 12)
194 | (((p)[3] & 0x3F) << 6)
195 | ((p)[4] & 0x3F));
196 p += 5;
197 }
198
199 MAYBE_UNIFY_CHAR (c);
200
201 if (len)
202 *len = p - saved_p;
203 if (advanced)
204 *advanced = p;
205 return c;
206}
207
208
209/* Translate character C by translation table TABLE. If C is
210 negative, translate a character specified by CHARSET and CODE. If
211 no translation is found in TABLE, return the untranslated
10453be9
KH
212 character. If TABLE is a list, elements are char tables. In this
213 case, translace C by all tables. */
0168c3d8
KH
214
215int
971de7fb 216translate_char (Lisp_Object table, int c)
0168c3d8 217{
10453be9
KH
218 if (CHAR_TABLE_P (table))
219 {
220 Lisp_Object ch;
221
222 ch = CHAR_TABLE_REF (table, c);
223 if (CHARACTERP (ch))
224 c = XINT (ch);
225 }
226 else
227 {
228 for (; CONSP (table); table = XCDR (table))
229 c = translate_char (XCAR (table), c);
230 }
231 return c;
0168c3d8
KH
232}
233
2e5db15c
KH
234/* Convert ASCII or 8-bit character C to unibyte. If C is none of
235 them, return (C & 0xFF).
0168c3d8
KH
236
237 The argument REV_TBL is now ignored. It will be removed in the
238 future. */
239
240int
971de7fb 241multibyte_char_to_unibyte (int c, Lisp_Object rev_tbl)
0168c3d8 242{
2e5db15c
KH
243 if (c < 0x80)
244 return c;
b672c5ae
KH
245 if (CHAR_BYTE8_P (c))
246 return CHAR_TO_BYTE8 (c);
2e5db15c 247 return (c & 0xFF);
0168c3d8
KH
248}
249
935d5b02
KH
250/* Like multibyte_char_to_unibyte, but return -1 if C is not supported
251 by charset_unibyte. */
252
253int
971de7fb 254multibyte_char_to_unibyte_safe (int c)
935d5b02 255{
2e5db15c
KH
256 if (c < 0x80)
257 return c;
935d5b02
KH
258 if (CHAR_BYTE8_P (c))
259 return CHAR_TO_BYTE8 (c);
2e5db15c 260 return -1;
935d5b02 261}
0168c3d8
KH
262
263DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
264 doc: /* Return non-nil if OBJECT is a character. */)
5842a27b 265 (Lisp_Object object, Lisp_Object ignore)
0168c3d8
KH
266{
267 return (CHARACTERP (object) ? Qt : Qnil);
268}
269
270DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
271 doc: /* Return the character of the maximum code. */)
5842a27b 272 (void)
0168c3d8
KH
273{
274 return make_number (MAX_CHAR);
275}
276
277DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
278 Sunibyte_char_to_multibyte, 1, 1, 0,
5556875b 279 doc: /* Convert the byte CH to multibyte character. */)
5842a27b 280 (Lisp_Object ch)
0168c3d8
KH
281{
282 int c;
0168c3d8
KH
283
284 CHECK_CHARACTER (ch);
285 c = XFASTINT (ch);
2e5db15c
KH
286 if (c >= 0x100)
287 error ("Not a unibyte character: %d", c);
4c0354d7 288 MAKE_CHAR_MULTIBYTE (c);
0168c3d8
KH
289 return make_number (c);
290}
291
292DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
293 Smultibyte_char_to_unibyte, 1, 1, 0,
5556875b
SM
294 doc: /* Convert the multibyte character CH to a byte.
295If the multibyte character does not represent a byte, return -1. */)
5842a27b 296 (Lisp_Object ch)
0168c3d8 297{
5556875b 298 int cm;
0168c3d8
KH
299
300 CHECK_CHARACTER (ch);
5556875b
SM
301 cm = XFASTINT (ch);
302 if (cm < 256)
303 /* Can't distinguish a byte read from a unibyte buffer from
304 a latin1 char, so let's let it slide. */
305 return ch;
306 else
307 {
2afc21f5 308 int cu = CHAR_TO_BYTE_SAFE (cm);
5556875b
SM
309 return make_number (cu);
310 }
0168c3d8
KH
311}
312
0168c3d8
KH
313DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
314 doc: /* Return width of CHAR when displayed in the current buffer.
315The width is measured by how many columns it occupies on the screen.
add553ac
JB
316Tab is taken to occupy `tab-width' columns.
317usage: (char-width CHAR) */)
5842a27b 318 (Lisp_Object ch)
0168c3d8
KH
319{
320 Lisp_Object disp;
321 int c, width;
322 struct Lisp_Char_Table *dp = buffer_display_table ();
323
324 CHECK_CHARACTER (ch);
325 c = XINT (ch);
326
327 /* Get the way the display table would display it. */
328 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
329
330 if (VECTORP (disp))
331 width = ASIZE (disp);
332 else
333 width = CHAR_WIDTH (c);
334
335 return make_number (width);
336}
337
0168c3d8
KH
338/* Return width of string STR of length LEN when displayed in the
339 current buffer. The width is measured by how many columns it
340 occupies on the screen. If PRECISION > 0, return the width of
341 longest substring that doesn't exceed PRECISION, and set number of
342 characters and bytes of the substring in *NCHARS and *NBYTES
343 respectively. */
344
579c18d0
EZ
345EMACS_INT
346c_string_width (const unsigned char *str, EMACS_INT len, int precision,
347 EMACS_INT *nchars, EMACS_INT *nbytes)
0168c3d8 348{
579c18d0
EZ
349 EMACS_INT i = 0, i_byte = 0;
350 EMACS_INT width = 0;
0168c3d8
KH
351 struct Lisp_Char_Table *dp = buffer_display_table ();
352
353 while (i_byte < len)
354 {
355 int bytes, thiswidth;
356 Lisp_Object val;
62a6e103 357 int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
0168c3d8
KH
358
359 if (dp)
360 {
361 val = DISP_CHAR_VECTOR (dp, c);
362 if (VECTORP (val))
363 thiswidth = XVECTOR (val)->size;
364 else
365 thiswidth = CHAR_WIDTH (c);
366 }
367 else
368 {
369 thiswidth = CHAR_WIDTH (c);
370 }
371
372 if (precision > 0
373 && (width + thiswidth > precision))
374 {
375 *nchars = i;
376 *nbytes = i_byte;
377 return width;
378 }
379 i++;
380 i_byte += bytes;
381 width += thiswidth;
382 }
383
384 if (precision > 0)
385 {
386 *nchars = i;
387 *nbytes = i_byte;
388 }
389
390 return width;
391}
392
1889b238
KH
393/* Return width of string STR of length LEN when displayed in the
394 current buffer. The width is measured by how many columns it
395 occupies on the screen. */
396
579c18d0
EZ
397EMACS_INT
398strwidth (const unsigned char *str, EMACS_INT len)
1889b238
KH
399{
400 return c_string_width (str, len, -1, NULL, NULL);
401}
402
0168c3d8
KH
403/* Return width of Lisp string STRING when displayed in the current
404 buffer. The width is measured by how many columns it occupies on
405 the screen while paying attention to compositions. If PRECISION >
406 0, return the width of longest substring that doesn't exceed
407 PRECISION, and set number of characters and bytes of the substring
408 in *NCHARS and *NBYTES respectively. */
409
579c18d0
EZ
410EMACS_INT
411lisp_string_width (Lisp_Object string, int precision,
412 EMACS_INT *nchars, EMACS_INT *nbytes)
0168c3d8 413{
579c18d0 414 EMACS_INT len = SCHARS (string);
0aee65b9
KH
415 /* This set multibyte to 0 even if STRING is multibyte when it
416 contains only ascii and eight-bit-graphic, but that's
417 intentional. */
418 int multibyte = len < SBYTES (string);
8f924df7 419 unsigned char *str = SDATA (string);
579c18d0
EZ
420 EMACS_INT i = 0, i_byte = 0;
421 EMACS_INT width = 0;
0168c3d8
KH
422 struct Lisp_Char_Table *dp = buffer_display_table ();
423
424 while (i < len)
425 {
426 int chars, bytes, thiswidth;
427 Lisp_Object val;
428 int cmp_id;
f4bc0685 429 EMACS_INT ignore, end;
0168c3d8
KH
430
431 if (find_composition (i, -1, &ignore, &end, &val, string)
432 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
433 >= 0))
434 {
435 thiswidth = composition_table[cmp_id]->width;
436 chars = end - i;
437 bytes = string_char_to_byte (string, end) - i_byte;
438 }
0168c3d8
KH
439 else
440 {
0aee65b9 441 int c;
0168c3d8 442
0aee65b9 443 if (multibyte)
62a6e103 444 c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
0aee65b9
KH
445 else
446 c = str[i_byte], bytes = 1;
0168c3d8 447 chars = 1;
0aee65b9
KH
448 if (dp)
449 {
450 val = DISP_CHAR_VECTOR (dp, c);
451 if (VECTORP (val))
452 thiswidth = XVECTOR (val)->size;
453 else
454 thiswidth = CHAR_WIDTH (c);
455 }
456 else
457 {
458 thiswidth = CHAR_WIDTH (c);
459 }
0168c3d8
KH
460 }
461
462 if (precision > 0
463 && (width + thiswidth > precision))
464 {
465 *nchars = i;
466 *nbytes = i_byte;
467 return width;
468 }
469 i += chars;
470 i_byte += bytes;
471 width += thiswidth;
472 }
473
474 if (precision > 0)
475 {
476 *nchars = i;
477 *nbytes = i_byte;
478 }
479
480 return width;
481}
482
483DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
484 doc: /* Return width of STRING when displayed in the current buffer.
485Width is measured by how many columns it occupies on the screen.
486When calculating width of a multibyte character in STRING,
487only the base leading-code is considered; the validity of
488the following bytes is not checked. Tabs in STRING are always
add553ac
JB
489taken to occupy `tab-width' columns.
490usage: (string-width STRING) */)
5842a27b 491 (Lisp_Object str)
0168c3d8
KH
492{
493 Lisp_Object val;
494
495 CHECK_STRING (str);
496 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
497 return val;
498}
499
500DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
501 doc: /* Return the direction of CHAR.
add553ac
JB
502The returned value is 0 for left-to-right and 1 for right-to-left.
503usage: (char-direction CHAR) */)
5842a27b 504 (Lisp_Object ch)
0168c3d8
KH
505{
506 int c;
507
508 CHECK_CHARACTER (ch);
509 c = XINT (ch);
510 return CHAR_TABLE_REF (Vchar_direction_table, c);
511}
512
0168c3d8
KH
513/* Return the number of characters in the NBYTES bytes at PTR.
514 This works by looking at the contents and checking for multibyte
515 sequences while assuming that there's no invalid sequence.
516 However, if the current buffer has enable-multibyte-characters =
517 nil, we treat each byte as a character. */
518
13818c30 519EMACS_INT
971de7fb 520chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
0168c3d8
KH
521{
522 /* current_buffer is null at early stages of Emacs initialization. */
523 if (current_buffer == 0
524 || NILP (current_buffer->enable_multibyte_characters))
525 return nbytes;
526
527 return multibyte_chars_in_text (ptr, nbytes);
528}
529
530/* Return the number of characters in the NBYTES bytes at PTR.
531 This works by looking at the contents and checking for multibyte
532 sequences while assuming that there's no invalid sequence. It
533 ignores enable-multibyte-characters. */
534
13818c30 535EMACS_INT
971de7fb 536multibyte_chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
0168c3d8 537{
8f924df7 538 const unsigned char *endp = ptr + nbytes;
579c18d0 539 EMACS_INT chars = 0;
0168c3d8
KH
540
541 while (ptr < endp)
542 {
579c18d0 543 EMACS_INT len = MULTIBYTE_LENGTH (ptr, endp);
0168c3d8
KH
544
545 if (len == 0)
546 abort ();
547 ptr += len;
548 chars++;
549 }
550
551 return chars;
552}
553
554/* Parse unibyte text at STR of LEN bytes as a multibyte text, count
555 characters and bytes in it, and store them in *NCHARS and *NBYTES
556 respectively. On counting bytes, pay attention to that 8-bit
557 characters not constructing a valid multibyte sequence are
558 represented by 2-byte in a multibyte text. */
559
560void
14162469
EZ
561parse_str_as_multibyte (const unsigned char *str, EMACS_INT len,
562 EMACS_INT *nchars, EMACS_INT *nbytes)
0168c3d8 563{
8f924df7 564 const unsigned char *endp = str + len;
14162469 565 EMACS_INT n, chars = 0, bytes = 0;
0168c3d8
KH
566
567 if (len >= MAX_MULTIBYTE_LENGTH)
568 {
8f924df7 569 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
0168c3d8
KH
570 while (str < adjusted_endp)
571 {
fc9a17bc
KH
572 if (! CHAR_BYTE8_HEAD_P (*str)
573 && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
0168c3d8
KH
574 str += n, bytes += n;
575 else
576 str++, bytes += 2;
577 chars++;
578 }
579 }
580 while (str < endp)
581 {
fc9a17bc
KH
582 if (! CHAR_BYTE8_HEAD_P (*str)
583 && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
0168c3d8
KH
584 str += n, bytes += n;
585 else
586 str++, bytes += 2;
587 chars++;
588 }
589
590 *nchars = chars;
591 *nbytes = bytes;
592 return;
593}
594
595/* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
596 It actually converts only such 8-bit characters that don't contruct
597 a multibyte sequence to multibyte forms of Latin-1 characters. If
598 NCHARS is nonzero, set *NCHARS to the number of characters in the
599 text. It is assured that we can use LEN bytes at STR as a work
600 area and that is enough. Return the number of bytes of the
601 resulting text. */
602
14162469
EZ
603EMACS_INT
604str_as_multibyte (unsigned char *str, EMACS_INT len, EMACS_INT nbytes,
605 EMACS_INT *nchars)
0168c3d8
KH
606{
607 unsigned char *p = str, *endp = str + nbytes;
608 unsigned char *to;
14162469 609 EMACS_INT chars = 0;
0168c3d8
KH
610 int n;
611
612 if (nbytes >= MAX_MULTIBYTE_LENGTH)
613 {
614 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
615 while (p < adjusted_endp
fc9a17bc 616 && ! CHAR_BYTE8_HEAD_P (*p)
0168c3d8
KH
617 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
618 p += n, chars++;
619 }
fc9a17bc
KH
620 while (p < endp
621 && ! CHAR_BYTE8_HEAD_P (*p)
622 && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
0168c3d8
KH
623 p += n, chars++;
624 if (nchars)
625 *nchars = chars;
626 if (p == endp)
627 return nbytes;
628
629 to = p;
630 nbytes = endp - p;
631 endp = str + len;
72af86bd 632 memmove (endp - nbytes, p, nbytes);
0168c3d8
KH
633 p = endp - nbytes;
634
635 if (nbytes >= MAX_MULTIBYTE_LENGTH)
636 {
637 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
638 while (p < adjusted_endp)
639 {
fc9a17bc
KH
640 if (! CHAR_BYTE8_HEAD_P (*p)
641 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
0168c3d8
KH
642 {
643 while (n--)
644 *to++ = *p++;
645 }
646 else
647 {
648 int c = *p++;
649 c = BYTE8_TO_CHAR (c);
650 to += CHAR_STRING (c, to);
651 }
652 }
653 chars++;
654 }
655 while (p < endp)
656 {
fc9a17bc
KH
657 if (! CHAR_BYTE8_HEAD_P (*p)
658 && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
0168c3d8
KH
659 {
660 while (n--)
661 *to++ = *p++;
8f924df7 662 }
0168c3d8
KH
663 else
664 {
665 int c = *p++;
666 c = BYTE8_TO_CHAR (c);
667 to += CHAR_STRING (c, to);
668 }
669 chars++;
670 }
671 if (nchars)
672 *nchars = chars;
673 return (to - str);
674}
675
676/* Parse unibyte string at STR of LEN bytes, and return the number of
677 bytes it may ocupy when converted to multibyte string by
678 `str_to_multibyte'. */
679
14162469
EZ
680EMACS_INT
681parse_str_to_multibyte (const unsigned char *str, EMACS_INT len)
0168c3d8 682{
eec47d6b 683 const unsigned char *endp = str + len;
14162469 684 EMACS_INT bytes;
0168c3d8
KH
685
686 for (bytes = 0; str < endp; str++)
687 bytes += (*str < 0x80) ? 1 : 2;
688 return bytes;
689}
690
691
692/* Convert unibyte text at STR of NBYTES bytes to a multibyte text
693 that contains the same single-byte characters. It actually
694 converts all 8-bit characters to multibyte forms. It is assured
695 that we can use LEN bytes at STR as a work area and that is
696 enough. */
697
14162469
EZ
698EMACS_INT
699str_to_multibyte (unsigned char *str, EMACS_INT len, EMACS_INT bytes)
0168c3d8
KH
700{
701 unsigned char *p = str, *endp = str + bytes;
702 unsigned char *to;
703
704 while (p < endp && *p < 0x80) p++;
705 if (p == endp)
706 return bytes;
707 to = p;
708 bytes = endp - p;
709 endp = str + len;
72af86bd 710 memmove (endp - bytes, p, bytes);
0168c3d8 711 p = endp - bytes;
8f924df7 712 while (p < endp)
0168c3d8
KH
713 {
714 int c = *p++;
715
716 if (c >= 0x80)
717 c = BYTE8_TO_CHAR (c);
718 to += CHAR_STRING (c, to);
719 }
720 return (to - str);
721}
722
723/* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
724 actually converts characters in the range 0x80..0xFF to
725 unibyte. */
726
14162469
EZ
727EMACS_INT
728str_as_unibyte (unsigned char *str, EMACS_INT bytes)
0168c3d8 729{
15843e6f
KH
730 const unsigned char *p = str, *endp = str + bytes;
731 unsigned char *to;
0168c3d8
KH
732 int c, len;
733
734 while (p < endp)
735 {
736 c = *p;
737 len = BYTES_BY_CHAR_HEAD (c);
738 if (CHAR_BYTE8_HEAD_P (c))
739 break;
740 p += len;
741 }
15843e6f 742 to = str + (p - str);
8f924df7 743 while (p < endp)
0168c3d8
KH
744 {
745 c = *p;
746 len = BYTES_BY_CHAR_HEAD (c);
747 if (CHAR_BYTE8_HEAD_P (c))
748 {
749 c = STRING_CHAR_ADVANCE (p);
750 *to++ = CHAR_TO_BYTE8 (c);
751 }
752 else
753 {
754 while (len--) *to++ = *p++;
755 }
756 }
757 return (to - str);
758}
759
4aa40bb8
KH
760/* Convert eight-bit chars in SRC (in multibyte form) to the
761 corresponding byte and store in DST. CHARS is the number of
762 characters in SRC. The value is the number of bytes stored in DST.
763 Usually, the value is the same as CHARS, but is less than it if SRC
764 contains a non-ASCII, non-eight-bit characater. If ACCEPT_LATIN_1
765 is nonzero, a Latin-1 character is accepted and converted to a byte
f27f70ec
KH
766 of that character code.
767 Note: Currently the arg ACCEPT_LATIN_1 is not used. */
4aa40bb8
KH
768
769EMACS_INT
971de7fb 770str_to_unibyte (const unsigned char *src, unsigned char *dst, EMACS_INT chars, int accept_latin_1)
4aa40bb8
KH
771{
772 EMACS_INT i;
773
774 for (i = 0; i < chars; i++)
775 {
776 int c = STRING_CHAR_ADVANCE (src);
777
778 if (CHAR_BYTE8_P (c))
779 c = CHAR_TO_BYTE8 (c);
780 else if (! ASCII_CHAR_P (c)
781 && (! accept_latin_1 || c >= 0x100))
782 return i;
783 *dst++ = c;
784 }
785 return i;
786}
787
788
14162469 789EMACS_INT
971de7fb 790string_count_byte8 (Lisp_Object string)
0168c3d8
KH
791{
792 int multibyte = STRING_MULTIBYTE (string);
14162469 793 EMACS_INT nbytes = SBYTES (string);
8f924df7 794 unsigned char *p = SDATA (string);
0168c3d8 795 unsigned char *pend = p + nbytes;
14162469 796 EMACS_INT count = 0;
0168c3d8
KH
797 int c, len;
798
799 if (multibyte)
800 while (p < pend)
801 {
802 c = *p;
803 len = BYTES_BY_CHAR_HEAD (c);
804
805 if (CHAR_BYTE8_HEAD_P (c))
806 count++;
807 p += len;
808 }
809 else
810 while (p < pend)
811 {
812 if (*p++ >= 0x80)
813 count++;
814 }
815 return count;
816}
817
818
819Lisp_Object
971de7fb 820string_escape_byte8 (Lisp_Object string)
0168c3d8 821{
14162469
EZ
822 EMACS_INT nchars = SCHARS (string);
823 EMACS_INT nbytes = SBYTES (string);
0168c3d8 824 int multibyte = STRING_MULTIBYTE (string);
14162469 825 EMACS_INT byte8_count;
15843e6f
KH
826 const unsigned char *src, *src_end;
827 unsigned char *dst;
0168c3d8
KH
828 Lisp_Object val;
829 int c, len;
830
831 if (multibyte && nchars == nbytes)
832 return string;
833
834 byte8_count = string_count_byte8 (string);
835
836 if (byte8_count == 0)
837 return string;
838
839 if (multibyte)
14162469
EZ
840 {
841 if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
842 || (MOST_POSITIVE_FIXNUM - nbytes) / 2 < byte8_count)
843 error ("Maximum string size exceeded");
844
845 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
846 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
847 nbytes + byte8_count * 2);
848 }
0168c3d8 849 else
14162469
EZ
850 {
851 if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count)
852 error ("Maximum string size exceeded");
853 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
854 val = make_uninit_string (nbytes + byte8_count * 3);
855 }
0168c3d8 856
8f924df7 857 src = SDATA (string);
0168c3d8 858 src_end = src + nbytes;
8f924df7 859 dst = SDATA (val);
0168c3d8
KH
860 if (multibyte)
861 while (src < src_end)
862 {
863 c = *src;
864 len = BYTES_BY_CHAR_HEAD (c);
865
866 if (CHAR_BYTE8_HEAD_P (c))
867 {
868 c = STRING_CHAR_ADVANCE (src);
869 c = CHAR_TO_BYTE8 (c);
1889b238 870 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
871 dst += 4;
872 }
873 else
874 while (len--) *dst++ = *src++;
875 }
876 else
877 while (src < src_end)
878 {
879 c = *src++;
880 if (c >= 0x80)
881 {
1889b238 882 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
883 dst += 4;
884 }
885 else
886 *dst++ = c;
887 }
888 return val;
889}
890
891\f
8f924df7 892DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
0168c3d8 893 doc: /*
d2e83296
DL
894Concatenate all the argument characters and make the result a string.
895usage: (string &rest CHARACTERS) */)
5842a27b 896 (int n, Lisp_Object *args)
0168c3d8 897{
754790b6
CY
898 int i, c;
899 unsigned char *buf, *p;
900 Lisp_Object str;
901 USE_SAFE_ALLOCA;
902
903 SAFE_ALLOCA (buf, unsigned char *, MAX_MULTIBYTE_LENGTH * n);
904 p = buf;
0168c3d8
KH
905
906 for (i = 0; i < n; i++)
907 {
908 CHECK_CHARACTER (args[i]);
909 c = XINT (args[i]);
910 p += CHAR_STRING (c, p);
911 }
912
754790b6
CY
913 str = make_string_from_bytes ((char *) buf, n, p - buf);
914 SAFE_FREE ();
915 return str;
0168c3d8
KH
916}
917
70b4969d 918DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
87d6f965
KH
919 doc: /* Concatenate all the argument bytes and make the result a unibyte string.
920usage: (unibyte-string &rest BYTES) */)
5842a27b 921 (int n, Lisp_Object *args)
70b4969d 922{
754790b6
CY
923 int i, c;
924 unsigned char *buf, *p;
925 Lisp_Object str;
926 USE_SAFE_ALLOCA;
927
928 SAFE_ALLOCA (buf, unsigned char *, n);
929 p = buf;
70b4969d
KH
930
931 for (i = 0; i < n; i++)
932 {
933 CHECK_NATNUM (args[i]);
934 c = XFASTINT (args[i]);
935 if (c >= 256)
936 args_out_of_range_3 (args[i], make_number (0), make_number (255));
937 *p++ = c;
938 }
939
754790b6
CY
940 str = make_string_from_bytes ((char *) buf, n, p - buf);
941 SAFE_FREE ();
942 return str;
70b4969d
KH
943}
944
c73ae4ae 945DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
d0363d44
KH
946 Schar_resolve_modifiers, 1, 1, 0,
947 doc: /* Resolve modifiers in the character CHAR.
948The value is a character with modifiers resolved into the character
949code. Unresolved modifiers are kept in the value.
c73ae4ae 950usage: (char-resolve-modifiers CHAR) */)
5842a27b 951 (Lisp_Object character)
d0363d44
KH
952{
953 int c;
954
955 CHECK_NUMBER (character);
956 c = XINT (character);
957 return make_number (char_resolve_modifier_mask (c));
958}
959
ee107a89
KH
960DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
961 doc: /* Return a byte value of a character at point.
962Optional 1st arg POSITION, if non-nil, is a position of a character to get
963a byte value.
964Optional 2nd arg STRING, if non-nil, is a string of which first
965character is a target to get a byte value. In this case, POSITION, if
966non-nil, is an index of a target character in the string.
967
968If the current buffer (or STRING) is multibyte, and the target
969character is not ASCII nor 8-bit character, an error is signalled. */)
5842a27b 970 (Lisp_Object position, Lisp_Object string)
ee107a89
KH
971{
972 int c;
973 EMACS_INT pos;
974 unsigned char *p;
975
976 if (NILP (string))
977 {
978 if (NILP (position))
979 {
980 p = PT_ADDR;
4d8e170e 981 }
ee107a89
KH
982 else
983 {
984 CHECK_NUMBER_COERCE_MARKER (position);
985 if (XINT (position) < BEGV || XINT (position) >= ZV)
986 args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
987 pos = XFASTINT (position);
988 p = CHAR_POS_ADDR (pos);
989 }
d5998e03
KH
990 if (NILP (current_buffer->enable_multibyte_characters))
991 return make_number (*p);
ee107a89
KH
992 }
993 else
994 {
995 CHECK_STRING (string);
996 if (NILP (position))
997 {
998 p = SDATA (string);
999 }
1000 else
1001 {
1002 CHECK_NATNUM (position);
1003 if (XINT (position) >= SCHARS (string))
1004 args_out_of_range (string, position);
1005 pos = XFASTINT (position);
1006 p = SDATA (string) + string_char_to_byte (string, pos);
1007 }
d5998e03
KH
1008 if (! STRING_MULTIBYTE (string))
1009 return make_number (*p);
ee107a89 1010 }
62a6e103 1011 c = STRING_CHAR (p);
ee107a89
KH
1012 if (CHAR_BYTE8_P (c))
1013 c = CHAR_TO_BYTE8 (c);
1014 else if (! ASCII_CHAR_P (c))
1015 error ("Not an ASCII nor an 8-bit character: %d", c);
1016 return make_number (c);
1017}
1018
1019
0168c3d8 1020void
971de7fb 1021init_character_once (void)
0168c3d8
KH
1022{
1023}
1024
1025#ifdef emacs
1026
1027void
971de7fb 1028syms_of_character (void)
0168c3d8
KH
1029{
1030 DEFSYM (Qcharacterp, "characterp");
1031 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1032
1033 staticpro (&Vchar_unify_table);
1034 Vchar_unify_table = Qnil;
1035
1036 defsubr (&Smax_char);
1037 defsubr (&Scharacterp);
1038 defsubr (&Sunibyte_char_to_multibyte);
1039 defsubr (&Smultibyte_char_to_unibyte);
0168c3d8
KH
1040 defsubr (&Schar_width);
1041 defsubr (&Sstring_width);
1042 defsubr (&Schar_direction);
0168c3d8 1043 defsubr (&Sstring);
70b4969d 1044 defsubr (&Sunibyte_string);
d0363d44 1045 defsubr (&Schar_resolve_modifiers);
ee107a89 1046 defsubr (&Sget_byte);
0168c3d8 1047
29208e82 1048 DEFVAR_LISP ("translation-table-vector", Vtranslation_table_vector,
0168c3d8 1049 doc: /*
68978cf0
DL
1050Vector recording all translation tables ever defined.
1051Each element is a pair (SYMBOL . TABLE) relating the table to the
1052symbol naming it. The ID of a translation table is an index into this vector. */);
0168c3d8
KH
1053 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1054
29208e82 1055 DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
0168c3d8
KH
1056 doc: /*
1057A char-table for characters which invoke auto-filling.
1058Such characters have value t in this table. */);
1059 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
6cc0e1ca
DL
1060 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1061 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
0168c3d8 1062
29208e82 1063 DEFVAR_LISP ("char-width-table", Vchar_width_table,
0168c3d8
KH
1064 doc: /*
1065A char-table for width (columns) of each character. */);
1066 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
be8b50bc
KH
1067 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1068 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1069 make_number (4));
0168c3d8 1070
29208e82 1071 DEFVAR_LISP ("char-direction-table", Vchar_direction_table,
0168c3d8
KH
1072 doc: /* A char-table for direction of each character. */);
1073 Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
1074
29208e82 1075 DEFVAR_LISP ("printable-chars", Vprintable_chars,
0168c3d8 1076 doc: /* A char-table for each printable character. */);
db6d4189 1077 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
67dde660
KH
1078 Fset_char_table_range (Vprintable_chars,
1079 Fcons (make_number (32), make_number (126)), Qt);
1080 Fset_char_table_range (Vprintable_chars,
1081 Fcons (make_number (160),
1082 make_number (MAX_5_BYTE_CHAR)), Qt);
15843e6f 1083
29208e82 1084 DEFVAR_LISP ("char-script-table", Vchar_script_table,
c57f3328
KH
1085 doc: /* Char table of script symbols.
1086It has one extra slot whose value is a list of script symbols. */);
1087
1088 /* Intern this now in case it isn't already done.
1089 Setting this variable twice is harmless.
1090 But don't staticpro it here--that is done in alloc.c. */
d67b4f80 1091 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
c57f3328
KH
1092 DEFSYM (Qchar_script_table, "char-script-table");
1093 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1094 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
c7e14352 1095
29208e82 1096 DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
c3bb7671 1097 doc: /* Alist of scripts vs the representative characters.
f4427a54 1098Each element is a cons (SCRIPT . CHARS).
4d8e170e 1099SCRIPT is a symbol representing a script or a subgroup of a script.
c3bb7671 1100CHARS is a list or a vector of characters.
472a4dc9 1101If it is a list, all characters in the list are necessary for supporting SCRIPT.
c3bb7671
KH
1102If it is a vector, one of the characters in the vector is necessary.
1103This variable is used to find a font for a specific script. */);
c7e14352 1104 Vscript_representative_chars = Qnil;
a3cbb631 1105
29208e82 1106 DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
a3cbb631 1107 doc: /* Char table of Unicode's "General Category".
472a4dc9
JB
1108All Unicode characters have one of the following values (symbol):
1109 Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
a3cbb631
KH
1110 Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1111See The Unicode Standard for the meaning of those values. */);
1112 /* The correct char-table is setup in characters.el. */
1113 Vunicode_category_table = Qnil;
0168c3d8
KH
1114}
1115
1116#endif /* emacs */