Merge from trunk
[bpt/emacs.git] / src / character.c
CommitLineData
0168c3d8
KH
1/* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
8f924df7 3 Licensed to the Free Software Foundation.
114f9c96 4 Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
ec62e0ac 5 Free Software Foundation, Inc.
114f9c96 6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
0168c3d8
KH
7 National Institute of Advanced Industrial Science and Technology (AIST)
8 Registration Number H13PRO009
9
10This file is part of GNU Emacs.
11
9ec0b715 12GNU Emacs is free software: you can redistribute it and/or modify
0168c3d8 13it under the terms of the GNU General Public License as published by
9ec0b715
GM
14the Free Software Foundation, either version 3 of the License, or
15(at your option) any later version.
0168c3d8
KH
16
17GNU Emacs is distributed in the hope that it will be useful,
18but WITHOUT ANY WARRANTY; without even the implied warranty of
19MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20GNU General Public License for more details.
21
22You should have received a copy of the GNU General Public License
9ec0b715 23along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
0168c3d8
KH
24
25/* At first, see the document in `character.h' to understand the code
26 in this file. */
27
28#ifdef emacs
29#include <config.h>
30#endif
31
32#include <stdio.h>
33
34#ifdef emacs
35
36#include <sys/types.h>
d7306fe6 37#include <setjmp.h>
0168c3d8
KH
38#include "lisp.h"
39#include "character.h"
40#include "buffer.h"
41#include "charset.h"
42#include "composite.h"
43#include "disptab.h"
44
45#else /* not emacs */
46
47#include "mulelib.h"
48
49#endif /* emacs */
50
51Lisp_Object Qcharacterp;
52
53/* Vector of translation table ever defined.
54 ID of a translation table is used to index this vector. */
55Lisp_Object Vtranslation_table_vector;
56
57/* A char-table for characters which may invoke auto-filling. */
58Lisp_Object Vauto_fill_chars;
59
60Lisp_Object Qauto_fill_chars;
61
33f91981 62/* Char-table of information about which character to unify to which
6f1b43a0 63 Unicode character. Mainly used by the macro MAYBE_UNIFY_CHAR. */
0168c3d8
KH
64Lisp_Object Vchar_unify_table;
65
66/* A char-table. An element is non-nil iff the corresponding
67 character has a printable glyph. */
68Lisp_Object Vprintable_chars;
69
70/* A char-table. An elemnent is a column-width of the corresponding
71 character. */
72Lisp_Object Vchar_width_table;
73
74/* A char-table. An element is a symbol indicating the direction
75 property of corresponding character. */
76Lisp_Object Vchar_direction_table;
77
8973478b 78/* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
0168c3d8 79unsigned char *_fetch_multibyte_char_p;
0168c3d8 80
c57f3328
KH
81/* Char table of scripts. */
82Lisp_Object Vchar_script_table;
83
c7e14352
KH
84/* Alist of scripts vs representative characters. */
85Lisp_Object Vscript_representative_chars;
86
c57f3328
KH
87static Lisp_Object Qchar_script_table;
88
a3cbb631 89Lisp_Object Vunicode_category_table;
0168c3d8
KH
90\f
91
2bde7652
KH
92/* If character code C has modifier masks, reflect them to the
93 character code if possible. Return the resulting code. */
94
95int
971de7fb 96char_resolve_modifier_mask (int c)
2bde7652 97{
d0363d44 98 /* A non-ASCII character can't reflect modifier bits to the code. */
2bde7652
KH
99 if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
100 return c;
101
102 /* For Meta, Shift, and Control modifiers, we need special care. */
2bde7652
KH
103 if (c & CHAR_SHIFT)
104 {
105 /* Shift modifier is valid only with [A-Za-z]. */
106 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
107 c &= ~CHAR_SHIFT;
108 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
109 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
03365d0e
KH
110 /* Shift modifier for control characters and SPC is ignored. */
111 else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
d0363d44
KH
112 c &= ~CHAR_SHIFT;
113 }
2bde7652
KH
114 if (c & CHAR_CTL)
115 {
116 /* Simulate the code in lread.c. */
117 /* Allow `\C- ' and `\C-?'. */
03365d0e
KH
118 if ((c & 0377) == ' ')
119 c &= ~0177 & ~ CHAR_CTL;
120 else if ((c & 0377) == '?')
121 c = 0177 | (c & ~0177 & ~CHAR_CTL);
2bde7652
KH
122 /* ASCII control chars are made from letters (both cases),
123 as well as the non-letters within 0100...0137. */
124 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
125 c &= (037 | (~0177 & ~CHAR_CTL));
126 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
127 c &= (037 | (~0177 & ~CHAR_CTL));
128 }
e9c1637d 129#if 0 /* This is outside the scope of this function. (bug#4751) */
03365d0e
KH
130 if (c & CHAR_META)
131 {
132 /* Move the meta bit to the right place for a string. */
133 c = (c & ~CHAR_META) | 0x80;
134 }
e9c1637d 135#endif
2bde7652
KH
136
137 return c;
138}
139
140
33f91981
KH
141/* Store multibyte form of character C at P. If C has modifier bits,
142 handle them appropriately. */
143
0168c3d8 144int
971de7fb 145char_string (unsigned int c, unsigned char *p)
0168c3d8
KH
146{
147 int bytes;
148
e3d8eb8c
KH
149 if (c & CHAR_MODIFIER_MASK)
150 {
2bde7652 151 c = (unsigned) char_resolve_modifier_mask ((int) c);
e3d8eb8c
KH
152 /* If C still has any modifier bits, just ignore it. */
153 c &= ~CHAR_MODIFIER_MASK;
154 }
155
0168c3d8
KH
156 MAYBE_UNIFY_CHAR (c);
157
e3d8eb8c 158 if (c <= MAX_3_BYTE_CHAR)
0168c3d8
KH
159 {
160 bytes = CHAR_STRING (c, p);
161 }
162 else if (c <= MAX_4_BYTE_CHAR)
163 {
164 p[0] = (0xF0 | (c >> 18));
165 p[1] = (0x80 | ((c >> 12) & 0x3F));
166 p[2] = (0x80 | ((c >> 6) & 0x3F));
167 p[3] = (0x80 | (c & 0x3F));
168 bytes = 4;
169 }
e3d8eb8c 170 else if (c <= MAX_5_BYTE_CHAR)
0168c3d8
KH
171 {
172 p[0] = 0xF8;
173 p[1] = (0x80 | ((c >> 18) & 0x0F));
174 p[2] = (0x80 | ((c >> 12) & 0x3F));
175 p[3] = (0x80 | ((c >> 6) & 0x3F));
176 p[4] = (0x80 | (c & 0x3F));
177 bytes = 5;
178 }
5aa91c9b 179 else if (c <= MAX_CHAR)
e3d8eb8c
KH
180 {
181 c = CHAR_TO_BYTE8 (c);
182 bytes = BYTE8_STRING (c, p);
183 }
5aa91c9b
KH
184 else
185 error ("Invalid character: %d", c);
1889b238 186
0168c3d8
KH
187 return bytes;
188}
189
190
33f91981
KH
191/* Return a character whose multibyte form is at P. Set LEN is not
192 NULL, it must be a pointer to integer. In that case, set *LEN to
193 the byte length of the multibyte form. If ADVANCED is not NULL, is
194 must be a pointer to unsigned char. In that case, set *ADVANCED to
195 the ending address (i.e. the starting address of the next
196 character) of the multibyte form. */
197
0168c3d8 198int
971de7fb 199string_char (const unsigned char *p, const unsigned char **advanced, int *len)
0168c3d8 200{
1889b238 201 int c;
15843e6f 202 const unsigned char *saved_p = p;
0168c3d8
KH
203
204 if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
205 {
206 c = STRING_CHAR_ADVANCE (p);
207 }
208 else if (! (*p & 0x08))
209 {
210 c = ((((p)[0] & 0xF) << 18)
211 | (((p)[1] & 0x3F) << 12)
212 | (((p)[2] & 0x3F) << 6)
213 | ((p)[3] & 0x3F));
214 p += 4;
215 }
216 else
217 {
218 c = ((((p)[1] & 0x3F) << 18)
219 | (((p)[2] & 0x3F) << 12)
220 | (((p)[3] & 0x3F) << 6)
221 | ((p)[4] & 0x3F));
222 p += 5;
223 }
224
225 MAYBE_UNIFY_CHAR (c);
226
227 if (len)
228 *len = p - saved_p;
229 if (advanced)
230 *advanced = p;
231 return c;
232}
233
234
235/* Translate character C by translation table TABLE. If C is
236 negative, translate a character specified by CHARSET and CODE. If
237 no translation is found in TABLE, return the untranslated
10453be9
KH
238 character. If TABLE is a list, elements are char tables. In this
239 case, translace C by all tables. */
0168c3d8
KH
240
241int
971de7fb 242translate_char (Lisp_Object table, int c)
0168c3d8 243{
10453be9
KH
244 if (CHAR_TABLE_P (table))
245 {
246 Lisp_Object ch;
247
248 ch = CHAR_TABLE_REF (table, c);
249 if (CHARACTERP (ch))
250 c = XINT (ch);
251 }
252 else
253 {
254 for (; CONSP (table); table = XCDR (table))
255 c = translate_char (XCAR (table), c);
256 }
257 return c;
0168c3d8
KH
258}
259
2e5db15c
KH
260/* Convert ASCII or 8-bit character C to unibyte. If C is none of
261 them, return (C & 0xFF).
0168c3d8
KH
262
263 The argument REV_TBL is now ignored. It will be removed in the
264 future. */
265
266int
971de7fb 267multibyte_char_to_unibyte (int c, Lisp_Object rev_tbl)
0168c3d8 268{
2e5db15c
KH
269 if (c < 0x80)
270 return c;
b672c5ae
KH
271 if (CHAR_BYTE8_P (c))
272 return CHAR_TO_BYTE8 (c);
2e5db15c 273 return (c & 0xFF);
0168c3d8
KH
274}
275
935d5b02
KH
276/* Like multibyte_char_to_unibyte, but return -1 if C is not supported
277 by charset_unibyte. */
278
279int
971de7fb 280multibyte_char_to_unibyte_safe (int c)
935d5b02 281{
2e5db15c
KH
282 if (c < 0x80)
283 return c;
935d5b02
KH
284 if (CHAR_BYTE8_P (c))
285 return CHAR_TO_BYTE8 (c);
2e5db15c 286 return -1;
935d5b02 287}
0168c3d8
KH
288
289DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
290 doc: /* Return non-nil if OBJECT is a character. */)
5842a27b 291 (Lisp_Object object, Lisp_Object ignore)
0168c3d8
KH
292{
293 return (CHARACTERP (object) ? Qt : Qnil);
294}
295
296DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
297 doc: /* Return the character of the maximum code. */)
5842a27b 298 (void)
0168c3d8
KH
299{
300 return make_number (MAX_CHAR);
301}
302
303DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
304 Sunibyte_char_to_multibyte, 1, 1, 0,
5556875b 305 doc: /* Convert the byte CH to multibyte character. */)
5842a27b 306 (Lisp_Object ch)
0168c3d8
KH
307{
308 int c;
0168c3d8
KH
309
310 CHECK_CHARACTER (ch);
311 c = XFASTINT (ch);
2e5db15c
KH
312 if (c >= 0x100)
313 error ("Not a unibyte character: %d", c);
4c0354d7 314 MAKE_CHAR_MULTIBYTE (c);
0168c3d8
KH
315 return make_number (c);
316}
317
318DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
319 Smultibyte_char_to_unibyte, 1, 1, 0,
5556875b
SM
320 doc: /* Convert the multibyte character CH to a byte.
321If the multibyte character does not represent a byte, return -1. */)
5842a27b 322 (Lisp_Object ch)
0168c3d8 323{
5556875b 324 int cm;
0168c3d8
KH
325
326 CHECK_CHARACTER (ch);
5556875b
SM
327 cm = XFASTINT (ch);
328 if (cm < 256)
329 /* Can't distinguish a byte read from a unibyte buffer from
330 a latin1 char, so let's let it slide. */
331 return ch;
332 else
333 {
2afc21f5 334 int cu = CHAR_TO_BYTE_SAFE (cm);
5556875b
SM
335 return make_number (cu);
336 }
0168c3d8
KH
337}
338
0168c3d8
KH
339DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
340 doc: /* Return width of CHAR when displayed in the current buffer.
341The width is measured by how many columns it occupies on the screen.
add553ac
JB
342Tab is taken to occupy `tab-width' columns.
343usage: (char-width CHAR) */)
5842a27b 344 (Lisp_Object ch)
0168c3d8
KH
345{
346 Lisp_Object disp;
347 int c, width;
348 struct Lisp_Char_Table *dp = buffer_display_table ();
349
350 CHECK_CHARACTER (ch);
351 c = XINT (ch);
352
353 /* Get the way the display table would display it. */
354 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
355
356 if (VECTORP (disp))
357 width = ASIZE (disp);
358 else
359 width = CHAR_WIDTH (c);
360
361 return make_number (width);
362}
363
0168c3d8
KH
364/* Return width of string STR of length LEN when displayed in the
365 current buffer. The width is measured by how many columns it
366 occupies on the screen. If PRECISION > 0, return the width of
367 longest substring that doesn't exceed PRECISION, and set number of
368 characters and bytes of the substring in *NCHARS and *NBYTES
369 respectively. */
370
579c18d0
EZ
371EMACS_INT
372c_string_width (const unsigned char *str, EMACS_INT len, int precision,
373 EMACS_INT *nchars, EMACS_INT *nbytes)
0168c3d8 374{
579c18d0
EZ
375 EMACS_INT i = 0, i_byte = 0;
376 EMACS_INT width = 0;
0168c3d8
KH
377 struct Lisp_Char_Table *dp = buffer_display_table ();
378
379 while (i_byte < len)
380 {
381 int bytes, thiswidth;
382 Lisp_Object val;
62a6e103 383 int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
0168c3d8
KH
384
385 if (dp)
386 {
387 val = DISP_CHAR_VECTOR (dp, c);
388 if (VECTORP (val))
389 thiswidth = XVECTOR (val)->size;
390 else
391 thiswidth = CHAR_WIDTH (c);
392 }
393 else
394 {
395 thiswidth = CHAR_WIDTH (c);
396 }
397
398 if (precision > 0
399 && (width + thiswidth > precision))
400 {
401 *nchars = i;
402 *nbytes = i_byte;
403 return width;
404 }
405 i++;
406 i_byte += bytes;
407 width += thiswidth;
408 }
409
410 if (precision > 0)
411 {
412 *nchars = i;
413 *nbytes = i_byte;
414 }
415
416 return width;
417}
418
1889b238
KH
419/* Return width of string STR of length LEN when displayed in the
420 current buffer. The width is measured by how many columns it
421 occupies on the screen. */
422
579c18d0
EZ
423EMACS_INT
424strwidth (const unsigned char *str, EMACS_INT len)
1889b238
KH
425{
426 return c_string_width (str, len, -1, NULL, NULL);
427}
428
0168c3d8
KH
429/* Return width of Lisp string STRING when displayed in the current
430 buffer. The width is measured by how many columns it occupies on
431 the screen while paying attention to compositions. If PRECISION >
432 0, return the width of longest substring that doesn't exceed
433 PRECISION, and set number of characters and bytes of the substring
434 in *NCHARS and *NBYTES respectively. */
435
579c18d0
EZ
436EMACS_INT
437lisp_string_width (Lisp_Object string, int precision,
438 EMACS_INT *nchars, EMACS_INT *nbytes)
0168c3d8 439{
579c18d0 440 EMACS_INT len = SCHARS (string);
0aee65b9
KH
441 /* This set multibyte to 0 even if STRING is multibyte when it
442 contains only ascii and eight-bit-graphic, but that's
443 intentional. */
444 int multibyte = len < SBYTES (string);
8f924df7 445 unsigned char *str = SDATA (string);
579c18d0
EZ
446 EMACS_INT i = 0, i_byte = 0;
447 EMACS_INT width = 0;
0168c3d8
KH
448 struct Lisp_Char_Table *dp = buffer_display_table ();
449
450 while (i < len)
451 {
452 int chars, bytes, thiswidth;
453 Lisp_Object val;
454 int cmp_id;
f4bc0685 455 EMACS_INT ignore, end;
0168c3d8
KH
456
457 if (find_composition (i, -1, &ignore, &end, &val, string)
458 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
459 >= 0))
460 {
461 thiswidth = composition_table[cmp_id]->width;
462 chars = end - i;
463 bytes = string_char_to_byte (string, end) - i_byte;
464 }
0168c3d8
KH
465 else
466 {
0aee65b9 467 int c;
0168c3d8 468
0aee65b9 469 if (multibyte)
62a6e103 470 c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
0aee65b9
KH
471 else
472 c = str[i_byte], bytes = 1;
0168c3d8 473 chars = 1;
0aee65b9
KH
474 if (dp)
475 {
476 val = DISP_CHAR_VECTOR (dp, c);
477 if (VECTORP (val))
478 thiswidth = XVECTOR (val)->size;
479 else
480 thiswidth = CHAR_WIDTH (c);
481 }
482 else
483 {
484 thiswidth = CHAR_WIDTH (c);
485 }
0168c3d8
KH
486 }
487
488 if (precision > 0
489 && (width + thiswidth > precision))
490 {
491 *nchars = i;
492 *nbytes = i_byte;
493 return width;
494 }
495 i += chars;
496 i_byte += bytes;
497 width += thiswidth;
498 }
499
500 if (precision > 0)
501 {
502 *nchars = i;
503 *nbytes = i_byte;
504 }
505
506 return width;
507}
508
509DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
510 doc: /* Return width of STRING when displayed in the current buffer.
511Width is measured by how many columns it occupies on the screen.
512When calculating width of a multibyte character in STRING,
513only the base leading-code is considered; the validity of
514the following bytes is not checked. Tabs in STRING are always
add553ac
JB
515taken to occupy `tab-width' columns.
516usage: (string-width STRING) */)
5842a27b 517 (Lisp_Object str)
0168c3d8
KH
518{
519 Lisp_Object val;
520
521 CHECK_STRING (str);
522 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
523 return val;
524}
525
526DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
527 doc: /* Return the direction of CHAR.
add553ac
JB
528The returned value is 0 for left-to-right and 1 for right-to-left.
529usage: (char-direction CHAR) */)
5842a27b 530 (Lisp_Object ch)
0168c3d8
KH
531{
532 int c;
533
534 CHECK_CHARACTER (ch);
535 c = XINT (ch);
536 return CHAR_TABLE_REF (Vchar_direction_table, c);
537}
538
0168c3d8
KH
539/* Return the number of characters in the NBYTES bytes at PTR.
540 This works by looking at the contents and checking for multibyte
541 sequences while assuming that there's no invalid sequence.
542 However, if the current buffer has enable-multibyte-characters =
543 nil, we treat each byte as a character. */
544
13818c30 545EMACS_INT
971de7fb 546chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
0168c3d8
KH
547{
548 /* current_buffer is null at early stages of Emacs initialization. */
549 if (current_buffer == 0
550 || NILP (current_buffer->enable_multibyte_characters))
551 return nbytes;
552
553 return multibyte_chars_in_text (ptr, nbytes);
554}
555
556/* Return the number of characters in the NBYTES bytes at PTR.
557 This works by looking at the contents and checking for multibyte
558 sequences while assuming that there's no invalid sequence. It
559 ignores enable-multibyte-characters. */
560
13818c30 561EMACS_INT
971de7fb 562multibyte_chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
0168c3d8 563{
8f924df7 564 const unsigned char *endp = ptr + nbytes;
579c18d0 565 EMACS_INT chars = 0;
0168c3d8
KH
566
567 while (ptr < endp)
568 {
579c18d0 569 EMACS_INT len = MULTIBYTE_LENGTH (ptr, endp);
0168c3d8
KH
570
571 if (len == 0)
572 abort ();
573 ptr += len;
574 chars++;
575 }
576
577 return chars;
578}
579
580/* Parse unibyte text at STR of LEN bytes as a multibyte text, count
581 characters and bytes in it, and store them in *NCHARS and *NBYTES
582 respectively. On counting bytes, pay attention to that 8-bit
583 characters not constructing a valid multibyte sequence are
584 represented by 2-byte in a multibyte text. */
585
586void
14162469
EZ
587parse_str_as_multibyte (const unsigned char *str, EMACS_INT len,
588 EMACS_INT *nchars, EMACS_INT *nbytes)
0168c3d8 589{
8f924df7 590 const unsigned char *endp = str + len;
14162469 591 EMACS_INT n, chars = 0, bytes = 0;
0168c3d8
KH
592
593 if (len >= MAX_MULTIBYTE_LENGTH)
594 {
8f924df7 595 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
0168c3d8
KH
596 while (str < adjusted_endp)
597 {
fc9a17bc
KH
598 if (! CHAR_BYTE8_HEAD_P (*str)
599 && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
0168c3d8
KH
600 str += n, bytes += n;
601 else
602 str++, bytes += 2;
603 chars++;
604 }
605 }
606 while (str < endp)
607 {
fc9a17bc
KH
608 if (! CHAR_BYTE8_HEAD_P (*str)
609 && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
0168c3d8
KH
610 str += n, bytes += n;
611 else
612 str++, bytes += 2;
613 chars++;
614 }
615
616 *nchars = chars;
617 *nbytes = bytes;
618 return;
619}
620
621/* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
622 It actually converts only such 8-bit characters that don't contruct
623 a multibyte sequence to multibyte forms of Latin-1 characters. If
624 NCHARS is nonzero, set *NCHARS to the number of characters in the
625 text. It is assured that we can use LEN bytes at STR as a work
626 area and that is enough. Return the number of bytes of the
627 resulting text. */
628
14162469
EZ
629EMACS_INT
630str_as_multibyte (unsigned char *str, EMACS_INT len, EMACS_INT nbytes,
631 EMACS_INT *nchars)
0168c3d8
KH
632{
633 unsigned char *p = str, *endp = str + nbytes;
634 unsigned char *to;
14162469 635 EMACS_INT chars = 0;
0168c3d8
KH
636 int n;
637
638 if (nbytes >= MAX_MULTIBYTE_LENGTH)
639 {
640 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
641 while (p < adjusted_endp
fc9a17bc 642 && ! CHAR_BYTE8_HEAD_P (*p)
0168c3d8
KH
643 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
644 p += n, chars++;
645 }
fc9a17bc
KH
646 while (p < endp
647 && ! CHAR_BYTE8_HEAD_P (*p)
648 && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
0168c3d8
KH
649 p += n, chars++;
650 if (nchars)
651 *nchars = chars;
652 if (p == endp)
653 return nbytes;
654
655 to = p;
656 nbytes = endp - p;
657 endp = str + len;
72af86bd 658 memmove (endp - nbytes, p, nbytes);
0168c3d8
KH
659 p = endp - nbytes;
660
661 if (nbytes >= MAX_MULTIBYTE_LENGTH)
662 {
663 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
664 while (p < adjusted_endp)
665 {
fc9a17bc
KH
666 if (! CHAR_BYTE8_HEAD_P (*p)
667 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
0168c3d8
KH
668 {
669 while (n--)
670 *to++ = *p++;
671 }
672 else
673 {
674 int c = *p++;
675 c = BYTE8_TO_CHAR (c);
676 to += CHAR_STRING (c, to);
677 }
678 }
679 chars++;
680 }
681 while (p < endp)
682 {
fc9a17bc
KH
683 if (! CHAR_BYTE8_HEAD_P (*p)
684 && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
0168c3d8
KH
685 {
686 while (n--)
687 *to++ = *p++;
8f924df7 688 }
0168c3d8
KH
689 else
690 {
691 int c = *p++;
692 c = BYTE8_TO_CHAR (c);
693 to += CHAR_STRING (c, to);
694 }
695 chars++;
696 }
697 if (nchars)
698 *nchars = chars;
699 return (to - str);
700}
701
702/* Parse unibyte string at STR of LEN bytes, and return the number of
703 bytes it may ocupy when converted to multibyte string by
704 `str_to_multibyte'. */
705
14162469
EZ
706EMACS_INT
707parse_str_to_multibyte (const unsigned char *str, EMACS_INT len)
0168c3d8 708{
eec47d6b 709 const unsigned char *endp = str + len;
14162469 710 EMACS_INT bytes;
0168c3d8
KH
711
712 for (bytes = 0; str < endp; str++)
713 bytes += (*str < 0x80) ? 1 : 2;
714 return bytes;
715}
716
717
718/* Convert unibyte text at STR of NBYTES bytes to a multibyte text
719 that contains the same single-byte characters. It actually
720 converts all 8-bit characters to multibyte forms. It is assured
721 that we can use LEN bytes at STR as a work area and that is
722 enough. */
723
14162469
EZ
724EMACS_INT
725str_to_multibyte (unsigned char *str, EMACS_INT len, EMACS_INT bytes)
0168c3d8
KH
726{
727 unsigned char *p = str, *endp = str + bytes;
728 unsigned char *to;
729
730 while (p < endp && *p < 0x80) p++;
731 if (p == endp)
732 return bytes;
733 to = p;
734 bytes = endp - p;
735 endp = str + len;
72af86bd 736 memmove (endp - bytes, p, bytes);
0168c3d8 737 p = endp - bytes;
8f924df7 738 while (p < endp)
0168c3d8
KH
739 {
740 int c = *p++;
741
742 if (c >= 0x80)
743 c = BYTE8_TO_CHAR (c);
744 to += CHAR_STRING (c, to);
745 }
746 return (to - str);
747}
748
749/* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
750 actually converts characters in the range 0x80..0xFF to
751 unibyte. */
752
14162469
EZ
753EMACS_INT
754str_as_unibyte (unsigned char *str, EMACS_INT bytes)
0168c3d8 755{
15843e6f
KH
756 const unsigned char *p = str, *endp = str + bytes;
757 unsigned char *to;
0168c3d8
KH
758 int c, len;
759
760 while (p < endp)
761 {
762 c = *p;
763 len = BYTES_BY_CHAR_HEAD (c);
764 if (CHAR_BYTE8_HEAD_P (c))
765 break;
766 p += len;
767 }
15843e6f 768 to = str + (p - str);
8f924df7 769 while (p < endp)
0168c3d8
KH
770 {
771 c = *p;
772 len = BYTES_BY_CHAR_HEAD (c);
773 if (CHAR_BYTE8_HEAD_P (c))
774 {
775 c = STRING_CHAR_ADVANCE (p);
776 *to++ = CHAR_TO_BYTE8 (c);
777 }
778 else
779 {
780 while (len--) *to++ = *p++;
781 }
782 }
783 return (to - str);
784}
785
4aa40bb8
KH
786/* Convert eight-bit chars in SRC (in multibyte form) to the
787 corresponding byte and store in DST. CHARS is the number of
788 characters in SRC. The value is the number of bytes stored in DST.
789 Usually, the value is the same as CHARS, but is less than it if SRC
790 contains a non-ASCII, non-eight-bit characater. If ACCEPT_LATIN_1
791 is nonzero, a Latin-1 character is accepted and converted to a byte
f27f70ec
KH
792 of that character code.
793 Note: Currently the arg ACCEPT_LATIN_1 is not used. */
4aa40bb8
KH
794
795EMACS_INT
971de7fb 796str_to_unibyte (const unsigned char *src, unsigned char *dst, EMACS_INT chars, int accept_latin_1)
4aa40bb8
KH
797{
798 EMACS_INT i;
799
800 for (i = 0; i < chars; i++)
801 {
802 int c = STRING_CHAR_ADVANCE (src);
803
804 if (CHAR_BYTE8_P (c))
805 c = CHAR_TO_BYTE8 (c);
806 else if (! ASCII_CHAR_P (c)
807 && (! accept_latin_1 || c >= 0x100))
808 return i;
809 *dst++ = c;
810 }
811 return i;
812}
813
814
14162469 815EMACS_INT
971de7fb 816string_count_byte8 (Lisp_Object string)
0168c3d8
KH
817{
818 int multibyte = STRING_MULTIBYTE (string);
14162469 819 EMACS_INT nbytes = SBYTES (string);
8f924df7 820 unsigned char *p = SDATA (string);
0168c3d8 821 unsigned char *pend = p + nbytes;
14162469 822 EMACS_INT count = 0;
0168c3d8
KH
823 int c, len;
824
825 if (multibyte)
826 while (p < pend)
827 {
828 c = *p;
829 len = BYTES_BY_CHAR_HEAD (c);
830
831 if (CHAR_BYTE8_HEAD_P (c))
832 count++;
833 p += len;
834 }
835 else
836 while (p < pend)
837 {
838 if (*p++ >= 0x80)
839 count++;
840 }
841 return count;
842}
843
844
845Lisp_Object
971de7fb 846string_escape_byte8 (Lisp_Object string)
0168c3d8 847{
14162469
EZ
848 EMACS_INT nchars = SCHARS (string);
849 EMACS_INT nbytes = SBYTES (string);
0168c3d8 850 int multibyte = STRING_MULTIBYTE (string);
14162469 851 EMACS_INT byte8_count;
15843e6f
KH
852 const unsigned char *src, *src_end;
853 unsigned char *dst;
0168c3d8
KH
854 Lisp_Object val;
855 int c, len;
856
857 if (multibyte && nchars == nbytes)
858 return string;
859
860 byte8_count = string_count_byte8 (string);
861
862 if (byte8_count == 0)
863 return string;
864
865 if (multibyte)
14162469
EZ
866 {
867 if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
868 || (MOST_POSITIVE_FIXNUM - nbytes) / 2 < byte8_count)
869 error ("Maximum string size exceeded");
870
871 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
872 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
873 nbytes + byte8_count * 2);
874 }
0168c3d8 875 else
14162469
EZ
876 {
877 if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count)
878 error ("Maximum string size exceeded");
879 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
880 val = make_uninit_string (nbytes + byte8_count * 3);
881 }
0168c3d8 882
8f924df7 883 src = SDATA (string);
0168c3d8 884 src_end = src + nbytes;
8f924df7 885 dst = SDATA (val);
0168c3d8
KH
886 if (multibyte)
887 while (src < src_end)
888 {
889 c = *src;
890 len = BYTES_BY_CHAR_HEAD (c);
891
892 if (CHAR_BYTE8_HEAD_P (c))
893 {
894 c = STRING_CHAR_ADVANCE (src);
895 c = CHAR_TO_BYTE8 (c);
1889b238 896 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
897 dst += 4;
898 }
899 else
900 while (len--) *dst++ = *src++;
901 }
902 else
903 while (src < src_end)
904 {
905 c = *src++;
906 if (c >= 0x80)
907 {
1889b238 908 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
909 dst += 4;
910 }
911 else
912 *dst++ = c;
913 }
914 return val;
915}
916
917\f
8f924df7 918DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
0168c3d8 919 doc: /*
d2e83296
DL
920Concatenate all the argument characters and make the result a string.
921usage: (string &rest CHARACTERS) */)
5842a27b 922 (int n, Lisp_Object *args)
0168c3d8 923{
754790b6
CY
924 int i, c;
925 unsigned char *buf, *p;
926 Lisp_Object str;
927 USE_SAFE_ALLOCA;
928
929 SAFE_ALLOCA (buf, unsigned char *, MAX_MULTIBYTE_LENGTH * n);
930 p = buf;
0168c3d8
KH
931
932 for (i = 0; i < n; i++)
933 {
934 CHECK_CHARACTER (args[i]);
935 c = XINT (args[i]);
936 p += CHAR_STRING (c, p);
937 }
938
754790b6
CY
939 str = make_string_from_bytes ((char *) buf, n, p - buf);
940 SAFE_FREE ();
941 return str;
0168c3d8
KH
942}
943
70b4969d 944DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
87d6f965
KH
945 doc: /* Concatenate all the argument bytes and make the result a unibyte string.
946usage: (unibyte-string &rest BYTES) */)
5842a27b 947 (int n, Lisp_Object *args)
70b4969d 948{
754790b6
CY
949 int i, c;
950 unsigned char *buf, *p;
951 Lisp_Object str;
952 USE_SAFE_ALLOCA;
953
954 SAFE_ALLOCA (buf, unsigned char *, n);
955 p = buf;
70b4969d
KH
956
957 for (i = 0; i < n; i++)
958 {
959 CHECK_NATNUM (args[i]);
960 c = XFASTINT (args[i]);
961 if (c >= 256)
962 args_out_of_range_3 (args[i], make_number (0), make_number (255));
963 *p++ = c;
964 }
965
754790b6
CY
966 str = make_string_from_bytes ((char *) buf, n, p - buf);
967 SAFE_FREE ();
968 return str;
70b4969d
KH
969}
970
c73ae4ae 971DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
d0363d44
KH
972 Schar_resolve_modifiers, 1, 1, 0,
973 doc: /* Resolve modifiers in the character CHAR.
974The value is a character with modifiers resolved into the character
975code. Unresolved modifiers are kept in the value.
c73ae4ae 976usage: (char-resolve-modifiers CHAR) */)
5842a27b 977 (Lisp_Object character)
d0363d44
KH
978{
979 int c;
980
981 CHECK_NUMBER (character);
982 c = XINT (character);
983 return make_number (char_resolve_modifier_mask (c));
984}
985
ee107a89
KH
986DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
987 doc: /* Return a byte value of a character at point.
988Optional 1st arg POSITION, if non-nil, is a position of a character to get
989a byte value.
990Optional 2nd arg STRING, if non-nil, is a string of which first
991character is a target to get a byte value. In this case, POSITION, if
992non-nil, is an index of a target character in the string.
993
994If the current buffer (or STRING) is multibyte, and the target
995character is not ASCII nor 8-bit character, an error is signalled. */)
5842a27b 996 (Lisp_Object position, Lisp_Object string)
ee107a89
KH
997{
998 int c;
999 EMACS_INT pos;
1000 unsigned char *p;
1001
1002 if (NILP (string))
1003 {
1004 if (NILP (position))
1005 {
1006 p = PT_ADDR;
4d8e170e 1007 }
ee107a89
KH
1008 else
1009 {
1010 CHECK_NUMBER_COERCE_MARKER (position);
1011 if (XINT (position) < BEGV || XINT (position) >= ZV)
1012 args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
1013 pos = XFASTINT (position);
1014 p = CHAR_POS_ADDR (pos);
1015 }
d5998e03
KH
1016 if (NILP (current_buffer->enable_multibyte_characters))
1017 return make_number (*p);
ee107a89
KH
1018 }
1019 else
1020 {
1021 CHECK_STRING (string);
1022 if (NILP (position))
1023 {
1024 p = SDATA (string);
1025 }
1026 else
1027 {
1028 CHECK_NATNUM (position);
1029 if (XINT (position) >= SCHARS (string))
1030 args_out_of_range (string, position);
1031 pos = XFASTINT (position);
1032 p = SDATA (string) + string_char_to_byte (string, pos);
1033 }
d5998e03
KH
1034 if (! STRING_MULTIBYTE (string))
1035 return make_number (*p);
ee107a89 1036 }
62a6e103 1037 c = STRING_CHAR (p);
ee107a89
KH
1038 if (CHAR_BYTE8_P (c))
1039 c = CHAR_TO_BYTE8 (c);
1040 else if (! ASCII_CHAR_P (c))
1041 error ("Not an ASCII nor an 8-bit character: %d", c);
1042 return make_number (c);
1043}
1044
1045
0168c3d8 1046void
971de7fb 1047init_character_once (void)
0168c3d8
KH
1048{
1049}
1050
1051#ifdef emacs
1052
1053void
971de7fb 1054syms_of_character (void)
0168c3d8
KH
1055{
1056 DEFSYM (Qcharacterp, "characterp");
1057 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1058
1059 staticpro (&Vchar_unify_table);
1060 Vchar_unify_table = Qnil;
1061
1062 defsubr (&Smax_char);
1063 defsubr (&Scharacterp);
1064 defsubr (&Sunibyte_char_to_multibyte);
1065 defsubr (&Smultibyte_char_to_unibyte);
0168c3d8
KH
1066 defsubr (&Schar_width);
1067 defsubr (&Sstring_width);
1068 defsubr (&Schar_direction);
0168c3d8 1069 defsubr (&Sstring);
70b4969d 1070 defsubr (&Sunibyte_string);
d0363d44 1071 defsubr (&Schar_resolve_modifiers);
ee107a89 1072 defsubr (&Sget_byte);
0168c3d8
KH
1073
1074 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
1075 doc: /*
68978cf0
DL
1076Vector recording all translation tables ever defined.
1077Each element is a pair (SYMBOL . TABLE) relating the table to the
1078symbol naming it. The ID of a translation table is an index into this vector. */);
0168c3d8
KH
1079 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1080
1081 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1082 doc: /*
1083A char-table for characters which invoke auto-filling.
1084Such characters have value t in this table. */);
1085 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
6cc0e1ca
DL
1086 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1087 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
0168c3d8
KH
1088
1089 DEFVAR_LISP ("char-width-table", &Vchar_width_table,
1090 doc: /*
1091A char-table for width (columns) of each character. */);
1092 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
be8b50bc
KH
1093 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1094 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1095 make_number (4));
0168c3d8
KH
1096
1097 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
1098 doc: /* A char-table for direction of each character. */);
1099 Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
1100
1101 DEFVAR_LISP ("printable-chars", &Vprintable_chars,
1102 doc: /* A char-table for each printable character. */);
db6d4189 1103 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
67dde660
KH
1104 Fset_char_table_range (Vprintable_chars,
1105 Fcons (make_number (32), make_number (126)), Qt);
1106 Fset_char_table_range (Vprintable_chars,
1107 Fcons (make_number (160),
1108 make_number (MAX_5_BYTE_CHAR)), Qt);
15843e6f 1109
c57f3328
KH
1110 DEFVAR_LISP ("char-script-table", &Vchar_script_table,
1111 doc: /* Char table of script symbols.
1112It has one extra slot whose value is a list of script symbols. */);
1113
1114 /* Intern this now in case it isn't already done.
1115 Setting this variable twice is harmless.
1116 But don't staticpro it here--that is done in alloc.c. */
d67b4f80 1117 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
c57f3328
KH
1118 DEFSYM (Qchar_script_table, "char-script-table");
1119 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1120 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
c7e14352
KH
1121
1122 DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
c3bb7671 1123 doc: /* Alist of scripts vs the representative characters.
f4427a54 1124Each element is a cons (SCRIPT . CHARS).
4d8e170e 1125SCRIPT is a symbol representing a script or a subgroup of a script.
c3bb7671 1126CHARS is a list or a vector of characters.
472a4dc9 1127If it is a list, all characters in the list are necessary for supporting SCRIPT.
c3bb7671
KH
1128If it is a vector, one of the characters in the vector is necessary.
1129This variable is used to find a font for a specific script. */);
c7e14352 1130 Vscript_representative_chars = Qnil;
a3cbb631
KH
1131
1132 DEFVAR_LISP ("unicode-category-table", &Vunicode_category_table,
1133 doc: /* Char table of Unicode's "General Category".
472a4dc9
JB
1134All Unicode characters have one of the following values (symbol):
1135 Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
a3cbb631
KH
1136 Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1137See The Unicode Standard for the meaning of those values. */);
1138 /* The correct char-table is setup in characters.el. */
1139 Vunicode_category_table = Qnil;
0168c3d8
KH
1140}
1141
1142#endif /* emacs */
fbaf0946
MB
1143
1144/* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1145 (do not change this comment) */