lisp.h: Fix a problem with aliasing and vector headers.
[bpt/emacs.git] / src / character.c
CommitLineData
0168c3d8 1/* Basic character support.
73b0cd50
GM
2
3Copyright (C) 2001-2011 Free Software Foundation, Inc.
4Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
5 Licensed to the Free Software Foundation.
6Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
7 National Institute of Advanced Industrial Science and Technology (AIST)
8 Registration Number H13PRO009
0168c3d8
KH
9
10This file is part of GNU Emacs.
11
9ec0b715 12GNU Emacs is free software: you can redistribute it and/or modify
0168c3d8 13it under the terms of the GNU General Public License as published by
9ec0b715
GM
14the Free Software Foundation, either version 3 of the License, or
15(at your option) any later version.
0168c3d8
KH
16
17GNU Emacs is distributed in the hope that it will be useful,
18but WITHOUT ANY WARRANTY; without even the implied warranty of
19MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20GNU General Public License for more details.
21
22You should have received a copy of the GNU General Public License
9ec0b715 23along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
0168c3d8
KH
24
25/* At first, see the document in `character.h' to understand the code
26 in this file. */
27
28#ifdef emacs
29#include <config.h>
30#endif
31
32#include <stdio.h>
33
34#ifdef emacs
35
36#include <sys/types.h>
d7306fe6 37#include <setjmp.h>
0168c3d8
KH
38#include "lisp.h"
39#include "character.h"
40#include "buffer.h"
41#include "charset.h"
42#include "composite.h"
43#include "disptab.h"
44
45#else /* not emacs */
46
47#include "mulelib.h"
48
49#endif /* emacs */
50
51Lisp_Object Qcharacterp;
52
955cbe7b 53static Lisp_Object Qauto_fill_chars;
0168c3d8 54
33f91981 55/* Char-table of information about which character to unify to which
6f1b43a0 56 Unicode character. Mainly used by the macro MAYBE_UNIFY_CHAR. */
0168c3d8
KH
57Lisp_Object Vchar_unify_table;
58
8973478b 59/* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
0168c3d8 60unsigned char *_fetch_multibyte_char_p;
0168c3d8 61
c57f3328
KH
62static Lisp_Object Qchar_script_table;
63
0168c3d8
KH
64\f
65
2bde7652
KH
66/* If character code C has modifier masks, reflect them to the
67 character code if possible. Return the resulting code. */
68
69int
971de7fb 70char_resolve_modifier_mask (int c)
2bde7652 71{
d0363d44 72 /* A non-ASCII character can't reflect modifier bits to the code. */
2bde7652
KH
73 if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
74 return c;
75
76 /* For Meta, Shift, and Control modifiers, we need special care. */
2bde7652
KH
77 if (c & CHAR_SHIFT)
78 {
79 /* Shift modifier is valid only with [A-Za-z]. */
80 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
81 c &= ~CHAR_SHIFT;
82 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
83 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
03365d0e
KH
84 /* Shift modifier for control characters and SPC is ignored. */
85 else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
d0363d44
KH
86 c &= ~CHAR_SHIFT;
87 }
2bde7652
KH
88 if (c & CHAR_CTL)
89 {
90 /* Simulate the code in lread.c. */
91 /* Allow `\C- ' and `\C-?'. */
03365d0e
KH
92 if ((c & 0377) == ' ')
93 c &= ~0177 & ~ CHAR_CTL;
94 else if ((c & 0377) == '?')
95 c = 0177 | (c & ~0177 & ~CHAR_CTL);
2bde7652
KH
96 /* ASCII control chars are made from letters (both cases),
97 as well as the non-letters within 0100...0137. */
98 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
99 c &= (037 | (~0177 & ~CHAR_CTL));
100 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
101 c &= (037 | (~0177 & ~CHAR_CTL));
102 }
e9c1637d 103#if 0 /* This is outside the scope of this function. (bug#4751) */
03365d0e
KH
104 if (c & CHAR_META)
105 {
106 /* Move the meta bit to the right place for a string. */
107 c = (c & ~CHAR_META) | 0x80;
108 }
e9c1637d 109#endif
2bde7652
KH
110
111 return c;
112}
113
114
33f91981
KH
115/* Store multibyte form of character C at P. If C has modifier bits,
116 handle them appropriately. */
117
0168c3d8 118int
971de7fb 119char_string (unsigned int c, unsigned char *p)
0168c3d8
KH
120{
121 int bytes;
122
e3d8eb8c
KH
123 if (c & CHAR_MODIFIER_MASK)
124 {
2bde7652 125 c = (unsigned) char_resolve_modifier_mask ((int) c);
e3d8eb8c
KH
126 /* If C still has any modifier bits, just ignore it. */
127 c &= ~CHAR_MODIFIER_MASK;
128 }
129
0168c3d8
KH
130 MAYBE_UNIFY_CHAR (c);
131
e3d8eb8c 132 if (c <= MAX_3_BYTE_CHAR)
0168c3d8
KH
133 {
134 bytes = CHAR_STRING (c, p);
135 }
136 else if (c <= MAX_4_BYTE_CHAR)
137 {
138 p[0] = (0xF0 | (c >> 18));
139 p[1] = (0x80 | ((c >> 12) & 0x3F));
140 p[2] = (0x80 | ((c >> 6) & 0x3F));
141 p[3] = (0x80 | (c & 0x3F));
142 bytes = 4;
143 }
e3d8eb8c 144 else if (c <= MAX_5_BYTE_CHAR)
0168c3d8
KH
145 {
146 p[0] = 0xF8;
147 p[1] = (0x80 | ((c >> 18) & 0x0F));
148 p[2] = (0x80 | ((c >> 12) & 0x3F));
149 p[3] = (0x80 | ((c >> 6) & 0x3F));
150 p[4] = (0x80 | (c & 0x3F));
151 bytes = 5;
152 }
5aa91c9b 153 else if (c <= MAX_CHAR)
e3d8eb8c
KH
154 {
155 c = CHAR_TO_BYTE8 (c);
156 bytes = BYTE8_STRING (c, p);
157 }
5aa91c9b
KH
158 else
159 error ("Invalid character: %d", c);
1889b238 160
0168c3d8
KH
161 return bytes;
162}
163
164
33f91981
KH
165/* Return a character whose multibyte form is at P. Set LEN is not
166 NULL, it must be a pointer to integer. In that case, set *LEN to
167 the byte length of the multibyte form. If ADVANCED is not NULL, is
168 must be a pointer to unsigned char. In that case, set *ADVANCED to
169 the ending address (i.e. the starting address of the next
170 character) of the multibyte form. */
171
0168c3d8 172int
971de7fb 173string_char (const unsigned char *p, const unsigned char **advanced, int *len)
0168c3d8 174{
1889b238 175 int c;
15843e6f 176 const unsigned char *saved_p = p;
0168c3d8
KH
177
178 if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
179 {
180 c = STRING_CHAR_ADVANCE (p);
181 }
182 else if (! (*p & 0x08))
183 {
184 c = ((((p)[0] & 0xF) << 18)
185 | (((p)[1] & 0x3F) << 12)
186 | (((p)[2] & 0x3F) << 6)
187 | ((p)[3] & 0x3F));
188 p += 4;
189 }
190 else
191 {
192 c = ((((p)[1] & 0x3F) << 18)
193 | (((p)[2] & 0x3F) << 12)
194 | (((p)[3] & 0x3F) << 6)
195 | ((p)[4] & 0x3F));
196 p += 5;
197 }
198
199 MAYBE_UNIFY_CHAR (c);
200
201 if (len)
202 *len = p - saved_p;
203 if (advanced)
204 *advanced = p;
205 return c;
206}
207
208
209/* Translate character C by translation table TABLE. If C is
210 negative, translate a character specified by CHARSET and CODE. If
211 no translation is found in TABLE, return the untranslated
10453be9
KH
212 character. If TABLE is a list, elements are char tables. In this
213 case, translace C by all tables. */
0168c3d8
KH
214
215int
971de7fb 216translate_char (Lisp_Object table, int c)
0168c3d8 217{
10453be9
KH
218 if (CHAR_TABLE_P (table))
219 {
220 Lisp_Object ch;
221
222 ch = CHAR_TABLE_REF (table, c);
223 if (CHARACTERP (ch))
224 c = XINT (ch);
225 }
226 else
227 {
228 for (; CONSP (table); table = XCDR (table))
229 c = translate_char (XCAR (table), c);
230 }
231 return c;
0168c3d8
KH
232}
233
2e5db15c 234/* Convert ASCII or 8-bit character C to unibyte. If C is none of
461c2ab9 235 them, return (C & 0xFF). */
0168c3d8
KH
236
237int
461c2ab9 238multibyte_char_to_unibyte (int c)
0168c3d8 239{
2e5db15c
KH
240 if (c < 0x80)
241 return c;
b672c5ae
KH
242 if (CHAR_BYTE8_P (c))
243 return CHAR_TO_BYTE8 (c);
2e5db15c 244 return (c & 0xFF);
0168c3d8
KH
245}
246
935d5b02
KH
247/* Like multibyte_char_to_unibyte, but return -1 if C is not supported
248 by charset_unibyte. */
249
250int
971de7fb 251multibyte_char_to_unibyte_safe (int c)
935d5b02 252{
2e5db15c
KH
253 if (c < 0x80)
254 return c;
935d5b02
KH
255 if (CHAR_BYTE8_P (c))
256 return CHAR_TO_BYTE8 (c);
2e5db15c 257 return -1;
935d5b02 258}
0168c3d8
KH
259
260DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
261 doc: /* Return non-nil if OBJECT is a character. */)
5842a27b 262 (Lisp_Object object, Lisp_Object ignore)
0168c3d8
KH
263{
264 return (CHARACTERP (object) ? Qt : Qnil);
265}
266
267DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
268 doc: /* Return the character of the maximum code. */)
5842a27b 269 (void)
0168c3d8
KH
270{
271 return make_number (MAX_CHAR);
272}
273
274DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
275 Sunibyte_char_to_multibyte, 1, 1, 0,
5556875b 276 doc: /* Convert the byte CH to multibyte character. */)
5842a27b 277 (Lisp_Object ch)
0168c3d8
KH
278{
279 int c;
0168c3d8
KH
280
281 CHECK_CHARACTER (ch);
282 c = XFASTINT (ch);
2e5db15c
KH
283 if (c >= 0x100)
284 error ("Not a unibyte character: %d", c);
4c0354d7 285 MAKE_CHAR_MULTIBYTE (c);
0168c3d8
KH
286 return make_number (c);
287}
288
289DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
290 Smultibyte_char_to_unibyte, 1, 1, 0,
5556875b
SM
291 doc: /* Convert the multibyte character CH to a byte.
292If the multibyte character does not represent a byte, return -1. */)
5842a27b 293 (Lisp_Object ch)
0168c3d8 294{
5556875b 295 int cm;
0168c3d8
KH
296
297 CHECK_CHARACTER (ch);
5556875b
SM
298 cm = XFASTINT (ch);
299 if (cm < 256)
300 /* Can't distinguish a byte read from a unibyte buffer from
301 a latin1 char, so let's let it slide. */
302 return ch;
303 else
304 {
2afc21f5 305 int cu = CHAR_TO_BYTE_SAFE (cm);
5556875b
SM
306 return make_number (cu);
307 }
0168c3d8
KH
308}
309
a7ca3326 310DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
0168c3d8
KH
311 doc: /* Return width of CHAR when displayed in the current buffer.
312The width is measured by how many columns it occupies on the screen.
add553ac
JB
313Tab is taken to occupy `tab-width' columns.
314usage: (char-width CHAR) */)
5842a27b 315 (Lisp_Object ch)
0168c3d8
KH
316{
317 Lisp_Object disp;
318 int c, width;
319 struct Lisp_Char_Table *dp = buffer_display_table ();
320
321 CHECK_CHARACTER (ch);
322 c = XINT (ch);
323
324 /* Get the way the display table would display it. */
325 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
326
327 if (VECTORP (disp))
328 width = ASIZE (disp);
329 else
330 width = CHAR_WIDTH (c);
331
332 return make_number (width);
333}
334
0168c3d8
KH
335/* Return width of string STR of length LEN when displayed in the
336 current buffer. The width is measured by how many columns it
337 occupies on the screen. If PRECISION > 0, return the width of
338 longest substring that doesn't exceed PRECISION, and set number of
339 characters and bytes of the substring in *NCHARS and *NBYTES
340 respectively. */
341
579c18d0
EZ
342EMACS_INT
343c_string_width (const unsigned char *str, EMACS_INT len, int precision,
344 EMACS_INT *nchars, EMACS_INT *nbytes)
0168c3d8 345{
579c18d0
EZ
346 EMACS_INT i = 0, i_byte = 0;
347 EMACS_INT width = 0;
0168c3d8
KH
348 struct Lisp_Char_Table *dp = buffer_display_table ();
349
350 while (i_byte < len)
351 {
352 int bytes, thiswidth;
353 Lisp_Object val;
62a6e103 354 int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
0168c3d8
KH
355
356 if (dp)
357 {
358 val = DISP_CHAR_VECTOR (dp, c);
359 if (VECTORP (val))
eab3844f 360 thiswidth = XVECTOR_SIZE (val);
0168c3d8
KH
361 else
362 thiswidth = CHAR_WIDTH (c);
363 }
364 else
365 {
366 thiswidth = CHAR_WIDTH (c);
367 }
368
369 if (precision > 0
370 && (width + thiswidth > precision))
371 {
372 *nchars = i;
373 *nbytes = i_byte;
374 return width;
375 }
376 i++;
377 i_byte += bytes;
378 width += thiswidth;
379 }
380
381 if (precision > 0)
382 {
383 *nchars = i;
384 *nbytes = i_byte;
385 }
386
387 return width;
388}
389
1889b238
KH
390/* Return width of string STR of length LEN when displayed in the
391 current buffer. The width is measured by how many columns it
392 occupies on the screen. */
393
579c18d0 394EMACS_INT
7469ef5d 395strwidth (const char *str, EMACS_INT len)
1889b238 396{
7469ef5d 397 return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
1889b238
KH
398}
399
0168c3d8
KH
400/* Return width of Lisp string STRING when displayed in the current
401 buffer. The width is measured by how many columns it occupies on
402 the screen while paying attention to compositions. If PRECISION >
403 0, return the width of longest substring that doesn't exceed
404 PRECISION, and set number of characters and bytes of the substring
405 in *NCHARS and *NBYTES respectively. */
406
579c18d0
EZ
407EMACS_INT
408lisp_string_width (Lisp_Object string, int precision,
409 EMACS_INT *nchars, EMACS_INT *nbytes)
0168c3d8 410{
579c18d0 411 EMACS_INT len = SCHARS (string);
0aee65b9
KH
412 /* This set multibyte to 0 even if STRING is multibyte when it
413 contains only ascii and eight-bit-graphic, but that's
414 intentional. */
415 int multibyte = len < SBYTES (string);
8f924df7 416 unsigned char *str = SDATA (string);
579c18d0
EZ
417 EMACS_INT i = 0, i_byte = 0;
418 EMACS_INT width = 0;
0168c3d8
KH
419 struct Lisp_Char_Table *dp = buffer_display_table ();
420
421 while (i < len)
422 {
423 int chars, bytes, thiswidth;
424 Lisp_Object val;
425 int cmp_id;
f4bc0685 426 EMACS_INT ignore, end;
0168c3d8
KH
427
428 if (find_composition (i, -1, &ignore, &end, &val, string)
429 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
430 >= 0))
431 {
432 thiswidth = composition_table[cmp_id]->width;
433 chars = end - i;
434 bytes = string_char_to_byte (string, end) - i_byte;
435 }
0168c3d8
KH
436 else
437 {
0aee65b9 438 int c;
0168c3d8 439
0aee65b9 440 if (multibyte)
62a6e103 441 c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
0aee65b9
KH
442 else
443 c = str[i_byte], bytes = 1;
0168c3d8 444 chars = 1;
0aee65b9
KH
445 if (dp)
446 {
447 val = DISP_CHAR_VECTOR (dp, c);
448 if (VECTORP (val))
eab3844f 449 thiswidth = XVECTOR_SIZE (val);
0aee65b9
KH
450 else
451 thiswidth = CHAR_WIDTH (c);
452 }
453 else
454 {
455 thiswidth = CHAR_WIDTH (c);
456 }
0168c3d8
KH
457 }
458
459 if (precision > 0
460 && (width + thiswidth > precision))
461 {
462 *nchars = i;
463 *nbytes = i_byte;
464 return width;
465 }
466 i += chars;
467 i_byte += bytes;
468 width += thiswidth;
469 }
470
471 if (precision > 0)
472 {
473 *nchars = i;
474 *nbytes = i_byte;
475 }
476
477 return width;
478}
479
480DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
481 doc: /* Return width of STRING when displayed in the current buffer.
482Width is measured by how many columns it occupies on the screen.
483When calculating width of a multibyte character in STRING,
484only the base leading-code is considered; the validity of
485the following bytes is not checked. Tabs in STRING are always
add553ac
JB
486taken to occupy `tab-width' columns.
487usage: (string-width STRING) */)
5842a27b 488 (Lisp_Object str)
0168c3d8
KH
489{
490 Lisp_Object val;
491
492 CHECK_STRING (str);
493 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
494 return val;
495}
496
497DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
498 doc: /* Return the direction of CHAR.
add553ac
JB
499The returned value is 0 for left-to-right and 1 for right-to-left.
500usage: (char-direction CHAR) */)
5842a27b 501 (Lisp_Object ch)
0168c3d8
KH
502{
503 int c;
504
505 CHECK_CHARACTER (ch);
506 c = XINT (ch);
507 return CHAR_TABLE_REF (Vchar_direction_table, c);
508}
509
0168c3d8
KH
510/* Return the number of characters in the NBYTES bytes at PTR.
511 This works by looking at the contents and checking for multibyte
512 sequences while assuming that there's no invalid sequence.
513 However, if the current buffer has enable-multibyte-characters =
514 nil, we treat each byte as a character. */
515
13818c30 516EMACS_INT
971de7fb 517chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
0168c3d8
KH
518{
519 /* current_buffer is null at early stages of Emacs initialization. */
520 if (current_buffer == 0
4b4deea2 521 || NILP (BVAR (current_buffer, enable_multibyte_characters)))
0168c3d8
KH
522 return nbytes;
523
524 return multibyte_chars_in_text (ptr, nbytes);
525}
526
527/* Return the number of characters in the NBYTES bytes at PTR.
528 This works by looking at the contents and checking for multibyte
529 sequences while assuming that there's no invalid sequence. It
530 ignores enable-multibyte-characters. */
531
13818c30 532EMACS_INT
971de7fb 533multibyte_chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
0168c3d8 534{
8f924df7 535 const unsigned char *endp = ptr + nbytes;
579c18d0 536 EMACS_INT chars = 0;
0168c3d8
KH
537
538 while (ptr < endp)
539 {
579c18d0 540 EMACS_INT len = MULTIBYTE_LENGTH (ptr, endp);
0168c3d8
KH
541
542 if (len == 0)
543 abort ();
544 ptr += len;
545 chars++;
546 }
547
548 return chars;
549}
550
551/* Parse unibyte text at STR of LEN bytes as a multibyte text, count
552 characters and bytes in it, and store them in *NCHARS and *NBYTES
553 respectively. On counting bytes, pay attention to that 8-bit
554 characters not constructing a valid multibyte sequence are
555 represented by 2-byte in a multibyte text. */
556
557void
14162469
EZ
558parse_str_as_multibyte (const unsigned char *str, EMACS_INT len,
559 EMACS_INT *nchars, EMACS_INT *nbytes)
0168c3d8 560{
8f924df7 561 const unsigned char *endp = str + len;
14162469 562 EMACS_INT n, chars = 0, bytes = 0;
0168c3d8
KH
563
564 if (len >= MAX_MULTIBYTE_LENGTH)
565 {
8f924df7 566 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
0168c3d8
KH
567 while (str < adjusted_endp)
568 {
fc9a17bc
KH
569 if (! CHAR_BYTE8_HEAD_P (*str)
570 && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
0168c3d8
KH
571 str += n, bytes += n;
572 else
573 str++, bytes += 2;
574 chars++;
575 }
576 }
577 while (str < endp)
578 {
fc9a17bc
KH
579 if (! CHAR_BYTE8_HEAD_P (*str)
580 && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
0168c3d8
KH
581 str += n, bytes += n;
582 else
583 str++, bytes += 2;
584 chars++;
585 }
586
587 *nchars = chars;
588 *nbytes = bytes;
589 return;
590}
591
592/* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
593 It actually converts only such 8-bit characters that don't contruct
594 a multibyte sequence to multibyte forms of Latin-1 characters. If
595 NCHARS is nonzero, set *NCHARS to the number of characters in the
596 text. It is assured that we can use LEN bytes at STR as a work
597 area and that is enough. Return the number of bytes of the
598 resulting text. */
599
14162469
EZ
600EMACS_INT
601str_as_multibyte (unsigned char *str, EMACS_INT len, EMACS_INT nbytes,
602 EMACS_INT *nchars)
0168c3d8
KH
603{
604 unsigned char *p = str, *endp = str + nbytes;
605 unsigned char *to;
14162469 606 EMACS_INT chars = 0;
0168c3d8
KH
607 int n;
608
609 if (nbytes >= MAX_MULTIBYTE_LENGTH)
610 {
611 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
612 while (p < adjusted_endp
fc9a17bc 613 && ! CHAR_BYTE8_HEAD_P (*p)
0168c3d8
KH
614 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
615 p += n, chars++;
616 }
fc9a17bc
KH
617 while (p < endp
618 && ! CHAR_BYTE8_HEAD_P (*p)
619 && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
0168c3d8
KH
620 p += n, chars++;
621 if (nchars)
622 *nchars = chars;
623 if (p == endp)
624 return nbytes;
625
626 to = p;
627 nbytes = endp - p;
628 endp = str + len;
72af86bd 629 memmove (endp - nbytes, p, nbytes);
0168c3d8
KH
630 p = endp - nbytes;
631
632 if (nbytes >= MAX_MULTIBYTE_LENGTH)
633 {
634 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
635 while (p < adjusted_endp)
636 {
fc9a17bc
KH
637 if (! CHAR_BYTE8_HEAD_P (*p)
638 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
0168c3d8
KH
639 {
640 while (n--)
641 *to++ = *p++;
642 }
643 else
644 {
645 int c = *p++;
646 c = BYTE8_TO_CHAR (c);
647 to += CHAR_STRING (c, to);
648 }
649 }
650 chars++;
651 }
652 while (p < endp)
653 {
fc9a17bc
KH
654 if (! CHAR_BYTE8_HEAD_P (*p)
655 && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
0168c3d8
KH
656 {
657 while (n--)
658 *to++ = *p++;
8f924df7 659 }
0168c3d8
KH
660 else
661 {
662 int c = *p++;
663 c = BYTE8_TO_CHAR (c);
664 to += CHAR_STRING (c, to);
665 }
666 chars++;
667 }
668 if (nchars)
669 *nchars = chars;
670 return (to - str);
671}
672
673/* Parse unibyte string at STR of LEN bytes, and return the number of
674 bytes it may ocupy when converted to multibyte string by
675 `str_to_multibyte'. */
676
14162469
EZ
677EMACS_INT
678parse_str_to_multibyte (const unsigned char *str, EMACS_INT len)
0168c3d8 679{
eec47d6b 680 const unsigned char *endp = str + len;
14162469 681 EMACS_INT bytes;
0168c3d8
KH
682
683 for (bytes = 0; str < endp; str++)
684 bytes += (*str < 0x80) ? 1 : 2;
685 return bytes;
686}
687
688
689/* Convert unibyte text at STR of NBYTES bytes to a multibyte text
690 that contains the same single-byte characters. It actually
691 converts all 8-bit characters to multibyte forms. It is assured
692 that we can use LEN bytes at STR as a work area and that is
693 enough. */
694
14162469
EZ
695EMACS_INT
696str_to_multibyte (unsigned char *str, EMACS_INT len, EMACS_INT bytes)
0168c3d8
KH
697{
698 unsigned char *p = str, *endp = str + bytes;
699 unsigned char *to;
700
701 while (p < endp && *p < 0x80) p++;
702 if (p == endp)
703 return bytes;
704 to = p;
705 bytes = endp - p;
706 endp = str + len;
72af86bd 707 memmove (endp - bytes, p, bytes);
0168c3d8 708 p = endp - bytes;
8f924df7 709 while (p < endp)
0168c3d8
KH
710 {
711 int c = *p++;
712
713 if (c >= 0x80)
714 c = BYTE8_TO_CHAR (c);
715 to += CHAR_STRING (c, to);
716 }
717 return (to - str);
718}
719
720/* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
721 actually converts characters in the range 0x80..0xFF to
722 unibyte. */
723
14162469
EZ
724EMACS_INT
725str_as_unibyte (unsigned char *str, EMACS_INT bytes)
0168c3d8 726{
15843e6f
KH
727 const unsigned char *p = str, *endp = str + bytes;
728 unsigned char *to;
0168c3d8
KH
729 int c, len;
730
731 while (p < endp)
732 {
733 c = *p;
734 len = BYTES_BY_CHAR_HEAD (c);
735 if (CHAR_BYTE8_HEAD_P (c))
736 break;
737 p += len;
738 }
15843e6f 739 to = str + (p - str);
8f924df7 740 while (p < endp)
0168c3d8
KH
741 {
742 c = *p;
743 len = BYTES_BY_CHAR_HEAD (c);
744 if (CHAR_BYTE8_HEAD_P (c))
745 {
746 c = STRING_CHAR_ADVANCE (p);
747 *to++ = CHAR_TO_BYTE8 (c);
748 }
749 else
750 {
751 while (len--) *to++ = *p++;
752 }
753 }
754 return (to - str);
755}
756
4aa40bb8
KH
757/* Convert eight-bit chars in SRC (in multibyte form) to the
758 corresponding byte and store in DST. CHARS is the number of
759 characters in SRC. The value is the number of bytes stored in DST.
760 Usually, the value is the same as CHARS, but is less than it if SRC
8307f923 761 contains a non-ASCII, non-eight-bit character. If ACCEPT_LATIN_1
4aa40bb8 762 is nonzero, a Latin-1 character is accepted and converted to a byte
f27f70ec
KH
763 of that character code.
764 Note: Currently the arg ACCEPT_LATIN_1 is not used. */
4aa40bb8
KH
765
766EMACS_INT
971de7fb 767str_to_unibyte (const unsigned char *src, unsigned char *dst, EMACS_INT chars, int accept_latin_1)
4aa40bb8
KH
768{
769 EMACS_INT i;
770
771 for (i = 0; i < chars; i++)
772 {
773 int c = STRING_CHAR_ADVANCE (src);
774
775 if (CHAR_BYTE8_P (c))
776 c = CHAR_TO_BYTE8 (c);
777 else if (! ASCII_CHAR_P (c)
778 && (! accept_latin_1 || c >= 0x100))
779 return i;
780 *dst++ = c;
781 }
782 return i;
783}
784
785
d0891610 786static EMACS_INT
971de7fb 787string_count_byte8 (Lisp_Object string)
0168c3d8
KH
788{
789 int multibyte = STRING_MULTIBYTE (string);
14162469 790 EMACS_INT nbytes = SBYTES (string);
8f924df7 791 unsigned char *p = SDATA (string);
0168c3d8 792 unsigned char *pend = p + nbytes;
14162469 793 EMACS_INT count = 0;
0168c3d8
KH
794 int c, len;
795
796 if (multibyte)
797 while (p < pend)
798 {
799 c = *p;
800 len = BYTES_BY_CHAR_HEAD (c);
801
802 if (CHAR_BYTE8_HEAD_P (c))
803 count++;
804 p += len;
805 }
806 else
807 while (p < pend)
808 {
809 if (*p++ >= 0x80)
810 count++;
811 }
812 return count;
813}
814
815
816Lisp_Object
971de7fb 817string_escape_byte8 (Lisp_Object string)
0168c3d8 818{
14162469
EZ
819 EMACS_INT nchars = SCHARS (string);
820 EMACS_INT nbytes = SBYTES (string);
0168c3d8 821 int multibyte = STRING_MULTIBYTE (string);
14162469 822 EMACS_INT byte8_count;
15843e6f
KH
823 const unsigned char *src, *src_end;
824 unsigned char *dst;
0168c3d8
KH
825 Lisp_Object val;
826 int c, len;
827
828 if (multibyte && nchars == nbytes)
829 return string;
830
831 byte8_count = string_count_byte8 (string);
832
833 if (byte8_count == 0)
834 return string;
835
836 if (multibyte)
14162469
EZ
837 {
838 if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
839 || (MOST_POSITIVE_FIXNUM - nbytes) / 2 < byte8_count)
840 error ("Maximum string size exceeded");
841
842 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
843 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
844 nbytes + byte8_count * 2);
845 }
0168c3d8 846 else
14162469
EZ
847 {
848 if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count)
849 error ("Maximum string size exceeded");
850 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
851 val = make_uninit_string (nbytes + byte8_count * 3);
852 }
0168c3d8 853
8f924df7 854 src = SDATA (string);
0168c3d8 855 src_end = src + nbytes;
8f924df7 856 dst = SDATA (val);
0168c3d8
KH
857 if (multibyte)
858 while (src < src_end)
859 {
860 c = *src;
861 len = BYTES_BY_CHAR_HEAD (c);
862
863 if (CHAR_BYTE8_HEAD_P (c))
864 {
865 c = STRING_CHAR_ADVANCE (src);
866 c = CHAR_TO_BYTE8 (c);
1889b238 867 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
868 dst += 4;
869 }
870 else
871 while (len--) *dst++ = *src++;
872 }
873 else
874 while (src < src_end)
875 {
876 c = *src++;
877 if (c >= 0x80)
878 {
1889b238 879 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
880 dst += 4;
881 }
882 else
883 *dst++ = c;
884 }
885 return val;
886}
887
888\f
a7ca3326 889DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
0168c3d8 890 doc: /*
d2e83296
DL
891Concatenate all the argument characters and make the result a string.
892usage: (string &rest CHARACTERS) */)
c5101a77 893 (size_t n, Lisp_Object *args)
0168c3d8 894{
c5101a77
PE
895 size_t i;
896 int c;
754790b6
CY
897 unsigned char *buf, *p;
898 Lisp_Object str;
899 USE_SAFE_ALLOCA;
900
901 SAFE_ALLOCA (buf, unsigned char *, MAX_MULTIBYTE_LENGTH * n);
902 p = buf;
0168c3d8
KH
903
904 for (i = 0; i < n; i++)
905 {
906 CHECK_CHARACTER (args[i]);
907 c = XINT (args[i]);
908 p += CHAR_STRING (c, p);
909 }
910
754790b6
CY
911 str = make_string_from_bytes ((char *) buf, n, p - buf);
912 SAFE_FREE ();
913 return str;
0168c3d8
KH
914}
915
70b4969d 916DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
87d6f965
KH
917 doc: /* Concatenate all the argument bytes and make the result a unibyte string.
918usage: (unibyte-string &rest BYTES) */)
c5101a77 919 (size_t n, Lisp_Object *args)
70b4969d 920{
c5101a77
PE
921 size_t i;
922 int c;
754790b6
CY
923 unsigned char *buf, *p;
924 Lisp_Object str;
925 USE_SAFE_ALLOCA;
926
927 SAFE_ALLOCA (buf, unsigned char *, n);
928 p = buf;
70b4969d
KH
929
930 for (i = 0; i < n; i++)
931 {
932 CHECK_NATNUM (args[i]);
933 c = XFASTINT (args[i]);
934 if (c >= 256)
935 args_out_of_range_3 (args[i], make_number (0), make_number (255));
936 *p++ = c;
937 }
938
754790b6
CY
939 str = make_string_from_bytes ((char *) buf, n, p - buf);
940 SAFE_FREE ();
941 return str;
70b4969d
KH
942}
943
c73ae4ae 944DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
d0363d44
KH
945 Schar_resolve_modifiers, 1, 1, 0,
946 doc: /* Resolve modifiers in the character CHAR.
947The value is a character with modifiers resolved into the character
948code. Unresolved modifiers are kept in the value.
c73ae4ae 949usage: (char-resolve-modifiers CHAR) */)
5842a27b 950 (Lisp_Object character)
d0363d44
KH
951{
952 int c;
953
954 CHECK_NUMBER (character);
955 c = XINT (character);
956 return make_number (char_resolve_modifier_mask (c));
957}
958
ee107a89
KH
959DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
960 doc: /* Return a byte value of a character at point.
961Optional 1st arg POSITION, if non-nil, is a position of a character to get
962a byte value.
963Optional 2nd arg STRING, if non-nil, is a string of which first
964character is a target to get a byte value. In this case, POSITION, if
965non-nil, is an index of a target character in the string.
966
967If the current buffer (or STRING) is multibyte, and the target
968character is not ASCII nor 8-bit character, an error is signalled. */)
5842a27b 969 (Lisp_Object position, Lisp_Object string)
ee107a89
KH
970{
971 int c;
972 EMACS_INT pos;
973 unsigned char *p;
974
975 if (NILP (string))
976 {
977 if (NILP (position))
978 {
979 p = PT_ADDR;
4d8e170e 980 }
ee107a89
KH
981 else
982 {
983 CHECK_NUMBER_COERCE_MARKER (position);
984 if (XINT (position) < BEGV || XINT (position) >= ZV)
985 args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
986 pos = XFASTINT (position);
987 p = CHAR_POS_ADDR (pos);
988 }
4b4deea2 989 if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
d5998e03 990 return make_number (*p);
ee107a89
KH
991 }
992 else
993 {
994 CHECK_STRING (string);
995 if (NILP (position))
996 {
997 p = SDATA (string);
998 }
999 else
1000 {
1001 CHECK_NATNUM (position);
1002 if (XINT (position) >= SCHARS (string))
1003 args_out_of_range (string, position);
1004 pos = XFASTINT (position);
1005 p = SDATA (string) + string_char_to_byte (string, pos);
1006 }
d5998e03
KH
1007 if (! STRING_MULTIBYTE (string))
1008 return make_number (*p);
ee107a89 1009 }
62a6e103 1010 c = STRING_CHAR (p);
ee107a89
KH
1011 if (CHAR_BYTE8_P (c))
1012 c = CHAR_TO_BYTE8 (c);
1013 else if (! ASCII_CHAR_P (c))
1014 error ("Not an ASCII nor an 8-bit character: %d", c);
1015 return make_number (c);
1016}
1017
1018
0168c3d8 1019void
971de7fb 1020init_character_once (void)
0168c3d8
KH
1021{
1022}
1023
1024#ifdef emacs
1025
1026void
971de7fb 1027syms_of_character (void)
0168c3d8
KH
1028{
1029 DEFSYM (Qcharacterp, "characterp");
1030 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1031
1032 staticpro (&Vchar_unify_table);
1033 Vchar_unify_table = Qnil;
1034
1035 defsubr (&Smax_char);
1036 defsubr (&Scharacterp);
1037 defsubr (&Sunibyte_char_to_multibyte);
1038 defsubr (&Smultibyte_char_to_unibyte);
0168c3d8
KH
1039 defsubr (&Schar_width);
1040 defsubr (&Sstring_width);
1041 defsubr (&Schar_direction);
0168c3d8 1042 defsubr (&Sstring);
70b4969d 1043 defsubr (&Sunibyte_string);
d0363d44 1044 defsubr (&Schar_resolve_modifiers);
ee107a89 1045 defsubr (&Sget_byte);
0168c3d8 1046
29208e82 1047 DEFVAR_LISP ("translation-table-vector", Vtranslation_table_vector,
0168c3d8 1048 doc: /*
68978cf0
DL
1049Vector recording all translation tables ever defined.
1050Each element is a pair (SYMBOL . TABLE) relating the table to the
1051symbol naming it. The ID of a translation table is an index into this vector. */);
0168c3d8
KH
1052 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1053
29208e82 1054 DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
0168c3d8
KH
1055 doc: /*
1056A char-table for characters which invoke auto-filling.
1057Such characters have value t in this table. */);
1058 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
6cc0e1ca
DL
1059 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1060 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
0168c3d8 1061
29208e82 1062 DEFVAR_LISP ("char-width-table", Vchar_width_table,
0168c3d8
KH
1063 doc: /*
1064A char-table for width (columns) of each character. */);
1065 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
be8b50bc
KH
1066 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1067 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1068 make_number (4));
0168c3d8 1069
29208e82 1070 DEFVAR_LISP ("char-direction-table", Vchar_direction_table,
0168c3d8
KH
1071 doc: /* A char-table for direction of each character. */);
1072 Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
1073
29208e82 1074 DEFVAR_LISP ("printable-chars", Vprintable_chars,
0168c3d8 1075 doc: /* A char-table for each printable character. */);
db6d4189 1076 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
67dde660
KH
1077 Fset_char_table_range (Vprintable_chars,
1078 Fcons (make_number (32), make_number (126)), Qt);
1079 Fset_char_table_range (Vprintable_chars,
1080 Fcons (make_number (160),
1081 make_number (MAX_5_BYTE_CHAR)), Qt);
15843e6f 1082
29208e82 1083 DEFVAR_LISP ("char-script-table", Vchar_script_table,
c57f3328
KH
1084 doc: /* Char table of script symbols.
1085It has one extra slot whose value is a list of script symbols. */);
1086
1087 /* Intern this now in case it isn't already done.
1088 Setting this variable twice is harmless.
1089 But don't staticpro it here--that is done in alloc.c. */
d67b4f80 1090 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
c57f3328
KH
1091 DEFSYM (Qchar_script_table, "char-script-table");
1092 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1093 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
c7e14352 1094
29208e82 1095 DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
c3bb7671 1096 doc: /* Alist of scripts vs the representative characters.
f4427a54 1097Each element is a cons (SCRIPT . CHARS).
4d8e170e 1098SCRIPT is a symbol representing a script or a subgroup of a script.
c3bb7671 1099CHARS is a list or a vector of characters.
472a4dc9 1100If it is a list, all characters in the list are necessary for supporting SCRIPT.
c3bb7671
KH
1101If it is a vector, one of the characters in the vector is necessary.
1102This variable is used to find a font for a specific script. */);
c7e14352 1103 Vscript_representative_chars = Qnil;
a3cbb631 1104
29208e82 1105 DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
a3cbb631 1106 doc: /* Char table of Unicode's "General Category".
472a4dc9
JB
1107All Unicode characters have one of the following values (symbol):
1108 Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
a3cbb631
KH
1109 Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1110See The Unicode Standard for the meaning of those values. */);
1111 /* The correct char-table is setup in characters.el. */
1112 Vunicode_category_table = Qnil;
0168c3d8
KH
1113}
1114
1115#endif /* emacs */