Convert (most) functions in src to standard C.
[bpt/emacs.git] / src / character.c
CommitLineData
0168c3d8
KH
1/* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
8f924df7 3 Licensed to the Free Software Foundation.
114f9c96 4 Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
ec62e0ac 5 Free Software Foundation, Inc.
114f9c96 6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
0168c3d8
KH
7 National Institute of Advanced Industrial Science and Technology (AIST)
8 Registration Number H13PRO009
9
10This file is part of GNU Emacs.
11
9ec0b715 12GNU Emacs is free software: you can redistribute it and/or modify
0168c3d8 13it under the terms of the GNU General Public License as published by
9ec0b715
GM
14the Free Software Foundation, either version 3 of the License, or
15(at your option) any later version.
0168c3d8
KH
16
17GNU Emacs is distributed in the hope that it will be useful,
18but WITHOUT ANY WARRANTY; without even the implied warranty of
19MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20GNU General Public License for more details.
21
22You should have received a copy of the GNU General Public License
9ec0b715 23along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
0168c3d8
KH
24
25/* At first, see the document in `character.h' to understand the code
26 in this file. */
27
28#ifdef emacs
29#include <config.h>
30#endif
31
32#include <stdio.h>
33
34#ifdef emacs
35
36#include <sys/types.h>
d7306fe6 37#include <setjmp.h>
0168c3d8
KH
38#include "lisp.h"
39#include "character.h"
40#include "buffer.h"
41#include "charset.h"
42#include "composite.h"
43#include "disptab.h"
44
45#else /* not emacs */
46
47#include "mulelib.h"
48
49#endif /* emacs */
50
51Lisp_Object Qcharacterp;
52
53/* Vector of translation table ever defined.
54 ID of a translation table is used to index this vector. */
55Lisp_Object Vtranslation_table_vector;
56
57/* A char-table for characters which may invoke auto-filling. */
58Lisp_Object Vauto_fill_chars;
59
60Lisp_Object Qauto_fill_chars;
61
33f91981 62/* Char-table of information about which character to unify to which
6f1b43a0 63 Unicode character. Mainly used by the macro MAYBE_UNIFY_CHAR. */
0168c3d8
KH
64Lisp_Object Vchar_unify_table;
65
66/* A char-table. An element is non-nil iff the corresponding
67 character has a printable glyph. */
68Lisp_Object Vprintable_chars;
69
70/* A char-table. An elemnent is a column-width of the corresponding
71 character. */
72Lisp_Object Vchar_width_table;
73
74/* A char-table. An element is a symbol indicating the direction
75 property of corresponding character. */
76Lisp_Object Vchar_direction_table;
77
8973478b 78/* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
0168c3d8 79unsigned char *_fetch_multibyte_char_p;
0168c3d8 80
c57f3328
KH
81/* Char table of scripts. */
82Lisp_Object Vchar_script_table;
83
c7e14352
KH
84/* Alist of scripts vs representative characters. */
85Lisp_Object Vscript_representative_chars;
86
c57f3328
KH
87static Lisp_Object Qchar_script_table;
88
a3cbb631 89Lisp_Object Vunicode_category_table;
0168c3d8
KH
90\f
91
2bde7652
KH
92/* If character code C has modifier masks, reflect them to the
93 character code if possible. Return the resulting code. */
94
95int
971de7fb 96char_resolve_modifier_mask (int c)
2bde7652 97{
d0363d44 98 /* A non-ASCII character can't reflect modifier bits to the code. */
2bde7652
KH
99 if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
100 return c;
101
102 /* For Meta, Shift, and Control modifiers, we need special care. */
2bde7652
KH
103 if (c & CHAR_SHIFT)
104 {
105 /* Shift modifier is valid only with [A-Za-z]. */
106 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
107 c &= ~CHAR_SHIFT;
108 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
109 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
03365d0e
KH
110 /* Shift modifier for control characters and SPC is ignored. */
111 else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
d0363d44
KH
112 c &= ~CHAR_SHIFT;
113 }
2bde7652
KH
114 if (c & CHAR_CTL)
115 {
116 /* Simulate the code in lread.c. */
117 /* Allow `\C- ' and `\C-?'. */
03365d0e
KH
118 if ((c & 0377) == ' ')
119 c &= ~0177 & ~ CHAR_CTL;
120 else if ((c & 0377) == '?')
121 c = 0177 | (c & ~0177 & ~CHAR_CTL);
2bde7652
KH
122 /* ASCII control chars are made from letters (both cases),
123 as well as the non-letters within 0100...0137. */
124 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
125 c &= (037 | (~0177 & ~CHAR_CTL));
126 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
127 c &= (037 | (~0177 & ~CHAR_CTL));
128 }
e9c1637d 129#if 0 /* This is outside the scope of this function. (bug#4751) */
03365d0e
KH
130 if (c & CHAR_META)
131 {
132 /* Move the meta bit to the right place for a string. */
133 c = (c & ~CHAR_META) | 0x80;
134 }
e9c1637d 135#endif
2bde7652
KH
136
137 return c;
138}
139
140
33f91981
KH
141/* Store multibyte form of character C at P. If C has modifier bits,
142 handle them appropriately. */
143
0168c3d8 144int
971de7fb 145char_string (unsigned int c, unsigned char *p)
0168c3d8
KH
146{
147 int bytes;
148
e3d8eb8c
KH
149 if (c & CHAR_MODIFIER_MASK)
150 {
2bde7652 151 c = (unsigned) char_resolve_modifier_mask ((int) c);
e3d8eb8c
KH
152 /* If C still has any modifier bits, just ignore it. */
153 c &= ~CHAR_MODIFIER_MASK;
154 }
155
0168c3d8
KH
156 MAYBE_UNIFY_CHAR (c);
157
e3d8eb8c 158 if (c <= MAX_3_BYTE_CHAR)
0168c3d8
KH
159 {
160 bytes = CHAR_STRING (c, p);
161 }
162 else if (c <= MAX_4_BYTE_CHAR)
163 {
164 p[0] = (0xF0 | (c >> 18));
165 p[1] = (0x80 | ((c >> 12) & 0x3F));
166 p[2] = (0x80 | ((c >> 6) & 0x3F));
167 p[3] = (0x80 | (c & 0x3F));
168 bytes = 4;
169 }
e3d8eb8c 170 else if (c <= MAX_5_BYTE_CHAR)
0168c3d8
KH
171 {
172 p[0] = 0xF8;
173 p[1] = (0x80 | ((c >> 18) & 0x0F));
174 p[2] = (0x80 | ((c >> 12) & 0x3F));
175 p[3] = (0x80 | ((c >> 6) & 0x3F));
176 p[4] = (0x80 | (c & 0x3F));
177 bytes = 5;
178 }
5aa91c9b 179 else if (c <= MAX_CHAR)
e3d8eb8c
KH
180 {
181 c = CHAR_TO_BYTE8 (c);
182 bytes = BYTE8_STRING (c, p);
183 }
5aa91c9b
KH
184 else
185 error ("Invalid character: %d", c);
1889b238 186
0168c3d8
KH
187 return bytes;
188}
189
190
33f91981
KH
191/* Return a character whose multibyte form is at P. Set LEN is not
192 NULL, it must be a pointer to integer. In that case, set *LEN to
193 the byte length of the multibyte form. If ADVANCED is not NULL, is
194 must be a pointer to unsigned char. In that case, set *ADVANCED to
195 the ending address (i.e. the starting address of the next
196 character) of the multibyte form. */
197
0168c3d8 198int
971de7fb 199string_char (const unsigned char *p, const unsigned char **advanced, int *len)
0168c3d8 200{
1889b238 201 int c;
15843e6f 202 const unsigned char *saved_p = p;
0168c3d8
KH
203
204 if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
205 {
206 c = STRING_CHAR_ADVANCE (p);
207 }
208 else if (! (*p & 0x08))
209 {
210 c = ((((p)[0] & 0xF) << 18)
211 | (((p)[1] & 0x3F) << 12)
212 | (((p)[2] & 0x3F) << 6)
213 | ((p)[3] & 0x3F));
214 p += 4;
215 }
216 else
217 {
218 c = ((((p)[1] & 0x3F) << 18)
219 | (((p)[2] & 0x3F) << 12)
220 | (((p)[3] & 0x3F) << 6)
221 | ((p)[4] & 0x3F));
222 p += 5;
223 }
224
225 MAYBE_UNIFY_CHAR (c);
226
227 if (len)
228 *len = p - saved_p;
229 if (advanced)
230 *advanced = p;
231 return c;
232}
233
234
235/* Translate character C by translation table TABLE. If C is
236 negative, translate a character specified by CHARSET and CODE. If
237 no translation is found in TABLE, return the untranslated
10453be9
KH
238 character. If TABLE is a list, elements are char tables. In this
239 case, translace C by all tables. */
0168c3d8
KH
240
241int
971de7fb 242translate_char (Lisp_Object table, int c)
0168c3d8 243{
10453be9
KH
244 if (CHAR_TABLE_P (table))
245 {
246 Lisp_Object ch;
247
248 ch = CHAR_TABLE_REF (table, c);
249 if (CHARACTERP (ch))
250 c = XINT (ch);
251 }
252 else
253 {
254 for (; CONSP (table); table = XCDR (table))
255 c = translate_char (XCAR (table), c);
256 }
257 return c;
0168c3d8
KH
258}
259
2e5db15c
KH
260/* Convert ASCII or 8-bit character C to unibyte. If C is none of
261 them, return (C & 0xFF).
0168c3d8
KH
262
263 The argument REV_TBL is now ignored. It will be removed in the
264 future. */
265
266int
971de7fb 267multibyte_char_to_unibyte (int c, Lisp_Object rev_tbl)
0168c3d8 268{
2e5db15c
KH
269 if (c < 0x80)
270 return c;
b672c5ae
KH
271 if (CHAR_BYTE8_P (c))
272 return CHAR_TO_BYTE8 (c);
2e5db15c 273 return (c & 0xFF);
0168c3d8
KH
274}
275
935d5b02
KH
276/* Like multibyte_char_to_unibyte, but return -1 if C is not supported
277 by charset_unibyte. */
278
279int
971de7fb 280multibyte_char_to_unibyte_safe (int c)
935d5b02 281{
2e5db15c
KH
282 if (c < 0x80)
283 return c;
935d5b02
KH
284 if (CHAR_BYTE8_P (c))
285 return CHAR_TO_BYTE8 (c);
2e5db15c 286 return -1;
935d5b02 287}
0168c3d8
KH
288
289DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
290 doc: /* Return non-nil if OBJECT is a character. */)
291 (object, ignore)
292 Lisp_Object object, ignore;
293{
294 return (CHARACTERP (object) ? Qt : Qnil);
295}
296
297DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
298 doc: /* Return the character of the maximum code. */)
299 ()
300{
301 return make_number (MAX_CHAR);
302}
303
304DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
305 Sunibyte_char_to_multibyte, 1, 1, 0,
5556875b 306 doc: /* Convert the byte CH to multibyte character. */)
0168c3d8
KH
307 (ch)
308 Lisp_Object ch;
309{
310 int c;
0168c3d8
KH
311
312 CHECK_CHARACTER (ch);
313 c = XFASTINT (ch);
2e5db15c
KH
314 if (c >= 0x100)
315 error ("Not a unibyte character: %d", c);
4c0354d7 316 MAKE_CHAR_MULTIBYTE (c);
0168c3d8
KH
317 return make_number (c);
318}
319
320DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
321 Smultibyte_char_to_unibyte, 1, 1, 0,
5556875b
SM
322 doc: /* Convert the multibyte character CH to a byte.
323If the multibyte character does not represent a byte, return -1. */)
0168c3d8
KH
324 (ch)
325 Lisp_Object ch;
326{
5556875b 327 int cm;
0168c3d8
KH
328
329 CHECK_CHARACTER (ch);
5556875b
SM
330 cm = XFASTINT (ch);
331 if (cm < 256)
332 /* Can't distinguish a byte read from a unibyte buffer from
333 a latin1 char, so let's let it slide. */
334 return ch;
335 else
336 {
2afc21f5 337 int cu = CHAR_TO_BYTE_SAFE (cm);
5556875b
SM
338 return make_number (cu);
339 }
0168c3d8
KH
340}
341
342DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
343 doc: /* Return 1 regardless of the argument CHAR.
add553ac
JB
344This is now an obsolete function. We keep it just for backward compatibility.
345usage: (char-bytes CHAR) */)
0168c3d8
KH
346 (ch)
347 Lisp_Object ch;
348{
349 CHECK_CHARACTER (ch);
350 return make_number (1);
351}
352
353DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
354 doc: /* Return width of CHAR when displayed in the current buffer.
355The width is measured by how many columns it occupies on the screen.
add553ac
JB
356Tab is taken to occupy `tab-width' columns.
357usage: (char-width CHAR) */)
0168c3d8
KH
358 (ch)
359 Lisp_Object ch;
360{
361 Lisp_Object disp;
362 int c, width;
363 struct Lisp_Char_Table *dp = buffer_display_table ();
364
365 CHECK_CHARACTER (ch);
366 c = XINT (ch);
367
368 /* Get the way the display table would display it. */
369 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
370
371 if (VECTORP (disp))
372 width = ASIZE (disp);
373 else
374 width = CHAR_WIDTH (c);
375
376 return make_number (width);
377}
378
0168c3d8
KH
379/* Return width of string STR of length LEN when displayed in the
380 current buffer. The width is measured by how many columns it
381 occupies on the screen. If PRECISION > 0, return the width of
382 longest substring that doesn't exceed PRECISION, and set number of
383 characters and bytes of the substring in *NCHARS and *NBYTES
384 respectively. */
385
1889b238 386int
63c125ab 387c_string_width (const unsigned char *str, int len, int precision, int *nchars, int *nbytes)
0168c3d8
KH
388{
389 int i = 0, i_byte = 0;
390 int width = 0;
391 struct Lisp_Char_Table *dp = buffer_display_table ();
392
393 while (i_byte < len)
394 {
395 int bytes, thiswidth;
396 Lisp_Object val;
62a6e103 397 int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
0168c3d8
KH
398
399 if (dp)
400 {
401 val = DISP_CHAR_VECTOR (dp, c);
402 if (VECTORP (val))
403 thiswidth = XVECTOR (val)->size;
404 else
405 thiswidth = CHAR_WIDTH (c);
406 }
407 else
408 {
409 thiswidth = CHAR_WIDTH (c);
410 }
411
412 if (precision > 0
413 && (width + thiswidth > precision))
414 {
415 *nchars = i;
416 *nbytes = i_byte;
417 return width;
418 }
419 i++;
420 i_byte += bytes;
421 width += thiswidth;
422 }
423
424 if (precision > 0)
425 {
426 *nchars = i;
427 *nbytes = i_byte;
428 }
429
430 return width;
431}
432
1889b238
KH
433/* Return width of string STR of length LEN when displayed in the
434 current buffer. The width is measured by how many columns it
435 occupies on the screen. */
436
437int
971de7fb 438strwidth (unsigned char *str, int len)
1889b238
KH
439{
440 return c_string_width (str, len, -1, NULL, NULL);
441}
442
0168c3d8
KH
443/* Return width of Lisp string STRING when displayed in the current
444 buffer. The width is measured by how many columns it occupies on
445 the screen while paying attention to compositions. If PRECISION >
446 0, return the width of longest substring that doesn't exceed
447 PRECISION, and set number of characters and bytes of the substring
448 in *NCHARS and *NBYTES respectively. */
449
450int
971de7fb 451lisp_string_width (Lisp_Object string, int precision, int *nchars, int *nbytes)
0168c3d8 452{
8f924df7 453 int len = SCHARS (string);
0aee65b9
KH
454 /* This set multibyte to 0 even if STRING is multibyte when it
455 contains only ascii and eight-bit-graphic, but that's
456 intentional. */
457 int multibyte = len < SBYTES (string);
8f924df7 458 unsigned char *str = SDATA (string);
0168c3d8
KH
459 int i = 0, i_byte = 0;
460 int width = 0;
461 struct Lisp_Char_Table *dp = buffer_display_table ();
462
463 while (i < len)
464 {
465 int chars, bytes, thiswidth;
466 Lisp_Object val;
467 int cmp_id;
f4bc0685 468 EMACS_INT ignore, end;
0168c3d8
KH
469
470 if (find_composition (i, -1, &ignore, &end, &val, string)
471 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
472 >= 0))
473 {
474 thiswidth = composition_table[cmp_id]->width;
475 chars = end - i;
476 bytes = string_char_to_byte (string, end) - i_byte;
477 }
0168c3d8
KH
478 else
479 {
0aee65b9 480 int c;
0168c3d8 481
0aee65b9 482 if (multibyte)
62a6e103 483 c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
0aee65b9
KH
484 else
485 c = str[i_byte], bytes = 1;
0168c3d8 486 chars = 1;
0aee65b9
KH
487 if (dp)
488 {
489 val = DISP_CHAR_VECTOR (dp, c);
490 if (VECTORP (val))
491 thiswidth = XVECTOR (val)->size;
492 else
493 thiswidth = CHAR_WIDTH (c);
494 }
495 else
496 {
497 thiswidth = CHAR_WIDTH (c);
498 }
0168c3d8
KH
499 }
500
501 if (precision > 0
502 && (width + thiswidth > precision))
503 {
504 *nchars = i;
505 *nbytes = i_byte;
506 return width;
507 }
508 i += chars;
509 i_byte += bytes;
510 width += thiswidth;
511 }
512
513 if (precision > 0)
514 {
515 *nchars = i;
516 *nbytes = i_byte;
517 }
518
519 return width;
520}
521
522DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
523 doc: /* Return width of STRING when displayed in the current buffer.
524Width is measured by how many columns it occupies on the screen.
525When calculating width of a multibyte character in STRING,
526only the base leading-code is considered; the validity of
527the following bytes is not checked. Tabs in STRING are always
add553ac
JB
528taken to occupy `tab-width' columns.
529usage: (string-width STRING) */)
0168c3d8
KH
530 (str)
531 Lisp_Object str;
532{
533 Lisp_Object val;
534
535 CHECK_STRING (str);
536 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
537 return val;
538}
539
540DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
541 doc: /* Return the direction of CHAR.
add553ac
JB
542The returned value is 0 for left-to-right and 1 for right-to-left.
543usage: (char-direction CHAR) */)
0168c3d8
KH
544 (ch)
545 Lisp_Object ch;
546{
547 int c;
548
549 CHECK_CHARACTER (ch);
550 c = XINT (ch);
551 return CHAR_TABLE_REF (Vchar_direction_table, c);
552}
553
0168c3d8
KH
554/* Return the number of characters in the NBYTES bytes at PTR.
555 This works by looking at the contents and checking for multibyte
556 sequences while assuming that there's no invalid sequence.
557 However, if the current buffer has enable-multibyte-characters =
558 nil, we treat each byte as a character. */
559
13818c30 560EMACS_INT
971de7fb 561chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
0168c3d8
KH
562{
563 /* current_buffer is null at early stages of Emacs initialization. */
564 if (current_buffer == 0
565 || NILP (current_buffer->enable_multibyte_characters))
566 return nbytes;
567
568 return multibyte_chars_in_text (ptr, nbytes);
569}
570
571/* Return the number of characters in the NBYTES bytes at PTR.
572 This works by looking at the contents and checking for multibyte
573 sequences while assuming that there's no invalid sequence. It
574 ignores enable-multibyte-characters. */
575
13818c30 576EMACS_INT
971de7fb 577multibyte_chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
0168c3d8 578{
8f924df7 579 const unsigned char *endp = ptr + nbytes;
0168c3d8
KH
580 int chars = 0;
581
582 while (ptr < endp)
583 {
584 int len = MULTIBYTE_LENGTH (ptr, endp);
585
586 if (len == 0)
587 abort ();
588 ptr += len;
589 chars++;
590 }
591
592 return chars;
593}
594
595/* Parse unibyte text at STR of LEN bytes as a multibyte text, count
596 characters and bytes in it, and store them in *NCHARS and *NBYTES
597 respectively. On counting bytes, pay attention to that 8-bit
598 characters not constructing a valid multibyte sequence are
599 represented by 2-byte in a multibyte text. */
600
601void
971de7fb 602parse_str_as_multibyte (const unsigned char *str, int len, int *nchars, int *nbytes)
0168c3d8 603{
8f924df7 604 const unsigned char *endp = str + len;
0168c3d8
KH
605 int n, chars = 0, bytes = 0;
606
607 if (len >= MAX_MULTIBYTE_LENGTH)
608 {
8f924df7 609 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
0168c3d8
KH
610 while (str < adjusted_endp)
611 {
fc9a17bc
KH
612 if (! CHAR_BYTE8_HEAD_P (*str)
613 && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
0168c3d8
KH
614 str += n, bytes += n;
615 else
616 str++, bytes += 2;
617 chars++;
618 }
619 }
620 while (str < endp)
621 {
fc9a17bc
KH
622 if (! CHAR_BYTE8_HEAD_P (*str)
623 && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
0168c3d8
KH
624 str += n, bytes += n;
625 else
626 str++, bytes += 2;
627 chars++;
628 }
629
630 *nchars = chars;
631 *nbytes = bytes;
632 return;
633}
634
635/* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
636 It actually converts only such 8-bit characters that don't contruct
637 a multibyte sequence to multibyte forms of Latin-1 characters. If
638 NCHARS is nonzero, set *NCHARS to the number of characters in the
639 text. It is assured that we can use LEN bytes at STR as a work
640 area and that is enough. Return the number of bytes of the
641 resulting text. */
642
643int
971de7fb 644str_as_multibyte (unsigned char *str, int len, int nbytes, int *nchars)
0168c3d8
KH
645{
646 unsigned char *p = str, *endp = str + nbytes;
647 unsigned char *to;
648 int chars = 0;
649 int n;
650
651 if (nbytes >= MAX_MULTIBYTE_LENGTH)
652 {
653 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
654 while (p < adjusted_endp
fc9a17bc 655 && ! CHAR_BYTE8_HEAD_P (*p)
0168c3d8
KH
656 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
657 p += n, chars++;
658 }
fc9a17bc
KH
659 while (p < endp
660 && ! CHAR_BYTE8_HEAD_P (*p)
661 && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
0168c3d8
KH
662 p += n, chars++;
663 if (nchars)
664 *nchars = chars;
665 if (p == endp)
666 return nbytes;
667
668 to = p;
669 nbytes = endp - p;
670 endp = str + len;
671 safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
672 p = endp - nbytes;
673
674 if (nbytes >= MAX_MULTIBYTE_LENGTH)
675 {
676 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
677 while (p < adjusted_endp)
678 {
fc9a17bc
KH
679 if (! CHAR_BYTE8_HEAD_P (*p)
680 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
0168c3d8
KH
681 {
682 while (n--)
683 *to++ = *p++;
684 }
685 else
686 {
687 int c = *p++;
688 c = BYTE8_TO_CHAR (c);
689 to += CHAR_STRING (c, to);
690 }
691 }
692 chars++;
693 }
694 while (p < endp)
695 {
fc9a17bc
KH
696 if (! CHAR_BYTE8_HEAD_P (*p)
697 && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
0168c3d8
KH
698 {
699 while (n--)
700 *to++ = *p++;
8f924df7 701 }
0168c3d8
KH
702 else
703 {
704 int c = *p++;
705 c = BYTE8_TO_CHAR (c);
706 to += CHAR_STRING (c, to);
707 }
708 chars++;
709 }
710 if (nchars)
711 *nchars = chars;
712 return (to - str);
713}
714
715/* Parse unibyte string at STR of LEN bytes, and return the number of
716 bytes it may ocupy when converted to multibyte string by
717 `str_to_multibyte'. */
718
719int
971de7fb 720parse_str_to_multibyte (unsigned char *str, int len)
0168c3d8
KH
721{
722 unsigned char *endp = str + len;
723 int bytes;
724
725 for (bytes = 0; str < endp; str++)
726 bytes += (*str < 0x80) ? 1 : 2;
727 return bytes;
728}
729
730
731/* Convert unibyte text at STR of NBYTES bytes to a multibyte text
732 that contains the same single-byte characters. It actually
733 converts all 8-bit characters to multibyte forms. It is assured
734 that we can use LEN bytes at STR as a work area and that is
735 enough. */
736
737int
971de7fb 738str_to_multibyte (unsigned char *str, int len, int bytes)
0168c3d8
KH
739{
740 unsigned char *p = str, *endp = str + bytes;
741 unsigned char *to;
742
743 while (p < endp && *p < 0x80) p++;
744 if (p == endp)
745 return bytes;
746 to = p;
747 bytes = endp - p;
748 endp = str + len;
749 safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
750 p = endp - bytes;
8f924df7 751 while (p < endp)
0168c3d8
KH
752 {
753 int c = *p++;
754
755 if (c >= 0x80)
756 c = BYTE8_TO_CHAR (c);
757 to += CHAR_STRING (c, to);
758 }
759 return (to - str);
760}
761
762/* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
763 actually converts characters in the range 0x80..0xFF to
764 unibyte. */
765
766int
971de7fb 767str_as_unibyte (unsigned char *str, int bytes)
0168c3d8 768{
15843e6f
KH
769 const unsigned char *p = str, *endp = str + bytes;
770 unsigned char *to;
0168c3d8
KH
771 int c, len;
772
773 while (p < endp)
774 {
775 c = *p;
776 len = BYTES_BY_CHAR_HEAD (c);
777 if (CHAR_BYTE8_HEAD_P (c))
778 break;
779 p += len;
780 }
15843e6f 781 to = str + (p - str);
8f924df7 782 while (p < endp)
0168c3d8
KH
783 {
784 c = *p;
785 len = BYTES_BY_CHAR_HEAD (c);
786 if (CHAR_BYTE8_HEAD_P (c))
787 {
788 c = STRING_CHAR_ADVANCE (p);
789 *to++ = CHAR_TO_BYTE8 (c);
790 }
791 else
792 {
793 while (len--) *to++ = *p++;
794 }
795 }
796 return (to - str);
797}
798
4aa40bb8
KH
799/* Convert eight-bit chars in SRC (in multibyte form) to the
800 corresponding byte and store in DST. CHARS is the number of
801 characters in SRC. The value is the number of bytes stored in DST.
802 Usually, the value is the same as CHARS, but is less than it if SRC
803 contains a non-ASCII, non-eight-bit characater. If ACCEPT_LATIN_1
804 is nonzero, a Latin-1 character is accepted and converted to a byte
f27f70ec
KH
805 of that character code.
806 Note: Currently the arg ACCEPT_LATIN_1 is not used. */
4aa40bb8
KH
807
808EMACS_INT
971de7fb 809str_to_unibyte (const unsigned char *src, unsigned char *dst, EMACS_INT chars, int accept_latin_1)
4aa40bb8
KH
810{
811 EMACS_INT i;
812
813 for (i = 0; i < chars; i++)
814 {
815 int c = STRING_CHAR_ADVANCE (src);
816
817 if (CHAR_BYTE8_P (c))
818 c = CHAR_TO_BYTE8 (c);
819 else if (! ASCII_CHAR_P (c)
820 && (! accept_latin_1 || c >= 0x100))
821 return i;
822 *dst++ = c;
823 }
824 return i;
825}
826
827
0168c3d8 828int
971de7fb 829string_count_byte8 (Lisp_Object string)
0168c3d8
KH
830{
831 int multibyte = STRING_MULTIBYTE (string);
8f924df7
KH
832 int nbytes = SBYTES (string);
833 unsigned char *p = SDATA (string);
0168c3d8
KH
834 unsigned char *pend = p + nbytes;
835 int count = 0;
836 int c, len;
837
838 if (multibyte)
839 while (p < pend)
840 {
841 c = *p;
842 len = BYTES_BY_CHAR_HEAD (c);
843
844 if (CHAR_BYTE8_HEAD_P (c))
845 count++;
846 p += len;
847 }
848 else
849 while (p < pend)
850 {
851 if (*p++ >= 0x80)
852 count++;
853 }
854 return count;
855}
856
857
858Lisp_Object
971de7fb 859string_escape_byte8 (Lisp_Object string)
0168c3d8 860{
8f924df7
KH
861 int nchars = SCHARS (string);
862 int nbytes = SBYTES (string);
0168c3d8
KH
863 int multibyte = STRING_MULTIBYTE (string);
864 int byte8_count;
15843e6f
KH
865 const unsigned char *src, *src_end;
866 unsigned char *dst;
0168c3d8
KH
867 Lisp_Object val;
868 int c, len;
869
870 if (multibyte && nchars == nbytes)
871 return string;
872
873 byte8_count = string_count_byte8 (string);
874
875 if (byte8_count == 0)
876 return string;
877
878 if (multibyte)
879 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
7b40ebaf 880 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
0168c3d8
KH
881 nbytes + byte8_count * 2);
882 else
883 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
884 val = make_uninit_string (nbytes + byte8_count * 3);
885
8f924df7 886 src = SDATA (string);
0168c3d8 887 src_end = src + nbytes;
8f924df7 888 dst = SDATA (val);
0168c3d8
KH
889 if (multibyte)
890 while (src < src_end)
891 {
892 c = *src;
893 len = BYTES_BY_CHAR_HEAD (c);
894
895 if (CHAR_BYTE8_HEAD_P (c))
896 {
897 c = STRING_CHAR_ADVANCE (src);
898 c = CHAR_TO_BYTE8 (c);
1889b238 899 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
900 dst += 4;
901 }
902 else
903 while (len--) *dst++ = *src++;
904 }
905 else
906 while (src < src_end)
907 {
908 c = *src++;
909 if (c >= 0x80)
910 {
1889b238 911 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
912 dst += 4;
913 }
914 else
915 *dst++ = c;
916 }
917 return val;
918}
919
920\f
8f924df7 921DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
0168c3d8 922 doc: /*
d2e83296
DL
923Concatenate all the argument characters and make the result a string.
924usage: (string &rest CHARACTERS) */)
0168c3d8
KH
925 (n, args)
926 int n;
927 Lisp_Object *args;
928{
754790b6
CY
929 int i, c;
930 unsigned char *buf, *p;
931 Lisp_Object str;
932 USE_SAFE_ALLOCA;
933
934 SAFE_ALLOCA (buf, unsigned char *, MAX_MULTIBYTE_LENGTH * n);
935 p = buf;
0168c3d8
KH
936
937 for (i = 0; i < n; i++)
938 {
939 CHECK_CHARACTER (args[i]);
940 c = XINT (args[i]);
941 p += CHAR_STRING (c, p);
942 }
943
754790b6
CY
944 str = make_string_from_bytes ((char *) buf, n, p - buf);
945 SAFE_FREE ();
946 return str;
0168c3d8
KH
947}
948
70b4969d 949DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
87d6f965
KH
950 doc: /* Concatenate all the argument bytes and make the result a unibyte string.
951usage: (unibyte-string &rest BYTES) */)
70b4969d
KH
952 (n, args)
953 int n;
954 Lisp_Object *args;
955{
754790b6
CY
956 int i, c;
957 unsigned char *buf, *p;
958 Lisp_Object str;
959 USE_SAFE_ALLOCA;
960
961 SAFE_ALLOCA (buf, unsigned char *, n);
962 p = buf;
70b4969d
KH
963
964 for (i = 0; i < n; i++)
965 {
966 CHECK_NATNUM (args[i]);
967 c = XFASTINT (args[i]);
968 if (c >= 256)
969 args_out_of_range_3 (args[i], make_number (0), make_number (255));
970 *p++ = c;
971 }
972
754790b6
CY
973 str = make_string_from_bytes ((char *) buf, n, p - buf);
974 SAFE_FREE ();
975 return str;
70b4969d
KH
976}
977
c73ae4ae 978DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
d0363d44
KH
979 Schar_resolve_modifiers, 1, 1, 0,
980 doc: /* Resolve modifiers in the character CHAR.
981The value is a character with modifiers resolved into the character
982code. Unresolved modifiers are kept in the value.
c73ae4ae 983usage: (char-resolve-modifiers CHAR) */)
d0363d44
KH
984 (character)
985 Lisp_Object character;
986{
987 int c;
988
989 CHECK_NUMBER (character);
990 c = XINT (character);
991 return make_number (char_resolve_modifier_mask (c));
992}
993
ee107a89
KH
994DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
995 doc: /* Return a byte value of a character at point.
996Optional 1st arg POSITION, if non-nil, is a position of a character to get
997a byte value.
998Optional 2nd arg STRING, if non-nil, is a string of which first
999character is a target to get a byte value. In this case, POSITION, if
1000non-nil, is an index of a target character in the string.
1001
1002If the current buffer (or STRING) is multibyte, and the target
1003character is not ASCII nor 8-bit character, an error is signalled. */)
1004 (position, string)
1005 Lisp_Object position, string;
1006{
1007 int c;
1008 EMACS_INT pos;
1009 unsigned char *p;
1010
1011 if (NILP (string))
1012 {
1013 if (NILP (position))
1014 {
1015 p = PT_ADDR;
4d8e170e 1016 }
ee107a89
KH
1017 else
1018 {
1019 CHECK_NUMBER_COERCE_MARKER (position);
1020 if (XINT (position) < BEGV || XINT (position) >= ZV)
1021 args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
1022 pos = XFASTINT (position);
1023 p = CHAR_POS_ADDR (pos);
1024 }
d5998e03
KH
1025 if (NILP (current_buffer->enable_multibyte_characters))
1026 return make_number (*p);
ee107a89
KH
1027 }
1028 else
1029 {
1030 CHECK_STRING (string);
1031 if (NILP (position))
1032 {
1033 p = SDATA (string);
1034 }
1035 else
1036 {
1037 CHECK_NATNUM (position);
1038 if (XINT (position) >= SCHARS (string))
1039 args_out_of_range (string, position);
1040 pos = XFASTINT (position);
1041 p = SDATA (string) + string_char_to_byte (string, pos);
1042 }
d5998e03
KH
1043 if (! STRING_MULTIBYTE (string))
1044 return make_number (*p);
ee107a89 1045 }
62a6e103 1046 c = STRING_CHAR (p);
ee107a89
KH
1047 if (CHAR_BYTE8_P (c))
1048 c = CHAR_TO_BYTE8 (c);
1049 else if (! ASCII_CHAR_P (c))
1050 error ("Not an ASCII nor an 8-bit character: %d", c);
1051 return make_number (c);
1052}
1053
1054
0168c3d8 1055void
971de7fb 1056init_character_once (void)
0168c3d8
KH
1057{
1058}
1059
1060#ifdef emacs
1061
1062void
971de7fb 1063syms_of_character (void)
0168c3d8
KH
1064{
1065 DEFSYM (Qcharacterp, "characterp");
1066 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1067
1068 staticpro (&Vchar_unify_table);
1069 Vchar_unify_table = Qnil;
1070
1071 defsubr (&Smax_char);
1072 defsubr (&Scharacterp);
1073 defsubr (&Sunibyte_char_to_multibyte);
1074 defsubr (&Smultibyte_char_to_unibyte);
1075 defsubr (&Schar_bytes);
1076 defsubr (&Schar_width);
1077 defsubr (&Sstring_width);
1078 defsubr (&Schar_direction);
0168c3d8 1079 defsubr (&Sstring);
70b4969d 1080 defsubr (&Sunibyte_string);
d0363d44 1081 defsubr (&Schar_resolve_modifiers);
ee107a89 1082 defsubr (&Sget_byte);
0168c3d8
KH
1083
1084 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
1085 doc: /*
68978cf0
DL
1086Vector recording all translation tables ever defined.
1087Each element is a pair (SYMBOL . TABLE) relating the table to the
1088symbol naming it. The ID of a translation table is an index into this vector. */);
0168c3d8
KH
1089 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1090
1091 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1092 doc: /*
1093A char-table for characters which invoke auto-filling.
1094Such characters have value t in this table. */);
1095 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
6cc0e1ca
DL
1096 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1097 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
0168c3d8
KH
1098
1099 DEFVAR_LISP ("char-width-table", &Vchar_width_table,
1100 doc: /*
1101A char-table for width (columns) of each character. */);
1102 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
be8b50bc
KH
1103 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1104 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1105 make_number (4));
0168c3d8
KH
1106
1107 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
1108 doc: /* A char-table for direction of each character. */);
1109 Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
1110
1111 DEFVAR_LISP ("printable-chars", &Vprintable_chars,
1112 doc: /* A char-table for each printable character. */);
db6d4189 1113 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
67dde660
KH
1114 Fset_char_table_range (Vprintable_chars,
1115 Fcons (make_number (32), make_number (126)), Qt);
1116 Fset_char_table_range (Vprintable_chars,
1117 Fcons (make_number (160),
1118 make_number (MAX_5_BYTE_CHAR)), Qt);
15843e6f 1119
c57f3328
KH
1120 DEFVAR_LISP ("char-script-table", &Vchar_script_table,
1121 doc: /* Char table of script symbols.
1122It has one extra slot whose value is a list of script symbols. */);
1123
1124 /* Intern this now in case it isn't already done.
1125 Setting this variable twice is harmless.
1126 But don't staticpro it here--that is done in alloc.c. */
d67b4f80 1127 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
c57f3328
KH
1128 DEFSYM (Qchar_script_table, "char-script-table");
1129 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1130 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
c7e14352
KH
1131
1132 DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
c3bb7671 1133 doc: /* Alist of scripts vs the representative characters.
f4427a54 1134Each element is a cons (SCRIPT . CHARS).
4d8e170e 1135SCRIPT is a symbol representing a script or a subgroup of a script.
c3bb7671 1136CHARS is a list or a vector of characters.
472a4dc9 1137If it is a list, all characters in the list are necessary for supporting SCRIPT.
c3bb7671
KH
1138If it is a vector, one of the characters in the vector is necessary.
1139This variable is used to find a font for a specific script. */);
c7e14352 1140 Vscript_representative_chars = Qnil;
a3cbb631
KH
1141
1142 DEFVAR_LISP ("unicode-category-table", &Vunicode_category_table,
1143 doc: /* Char table of Unicode's "General Category".
472a4dc9
JB
1144All Unicode characters have one of the following values (symbol):
1145 Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
a3cbb631
KH
1146 Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1147See The Unicode Standard for the meaning of those values. */);
1148 /* The correct char-table is setup in characters.el. */
1149 Vunicode_category_table = Qnil;
0168c3d8
KH
1150}
1151
1152#endif /* emacs */
fbaf0946
MB
1153
1154/* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1155 (do not change this comment) */