Merge changes made in Gnus trunk.
[bpt/emacs.git] / src / character.c
CommitLineData
0168c3d8
KH
1/* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
8f924df7 3 Licensed to the Free Software Foundation.
114f9c96 4 Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
ec62e0ac 5 Free Software Foundation, Inc.
114f9c96 6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
0168c3d8
KH
7 National Institute of Advanced Industrial Science and Technology (AIST)
8 Registration Number H13PRO009
9
10This file is part of GNU Emacs.
11
9ec0b715 12GNU Emacs is free software: you can redistribute it and/or modify
0168c3d8 13it under the terms of the GNU General Public License as published by
9ec0b715
GM
14the Free Software Foundation, either version 3 of the License, or
15(at your option) any later version.
0168c3d8
KH
16
17GNU Emacs is distributed in the hope that it will be useful,
18but WITHOUT ANY WARRANTY; without even the implied warranty of
19MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20GNU General Public License for more details.
21
22You should have received a copy of the GNU General Public License
9ec0b715 23along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
0168c3d8
KH
24
25/* At first, see the document in `character.h' to understand the code
26 in this file. */
27
28#ifdef emacs
29#include <config.h>
30#endif
31
32#include <stdio.h>
33
34#ifdef emacs
35
36#include <sys/types.h>
d7306fe6 37#include <setjmp.h>
0168c3d8
KH
38#include "lisp.h"
39#include "character.h"
40#include "buffer.h"
41#include "charset.h"
42#include "composite.h"
43#include "disptab.h"
44
45#else /* not emacs */
46
47#include "mulelib.h"
48
49#endif /* emacs */
50
51Lisp_Object Qcharacterp;
52
53/* Vector of translation table ever defined.
54 ID of a translation table is used to index this vector. */
55Lisp_Object Vtranslation_table_vector;
56
57/* A char-table for characters which may invoke auto-filling. */
58Lisp_Object Vauto_fill_chars;
59
60Lisp_Object Qauto_fill_chars;
61
33f91981 62/* Char-table of information about which character to unify to which
6f1b43a0 63 Unicode character. Mainly used by the macro MAYBE_UNIFY_CHAR. */
0168c3d8
KH
64Lisp_Object Vchar_unify_table;
65
66/* A char-table. An element is non-nil iff the corresponding
67 character has a printable glyph. */
68Lisp_Object Vprintable_chars;
69
70/* A char-table. An elemnent is a column-width of the corresponding
71 character. */
72Lisp_Object Vchar_width_table;
73
74/* A char-table. An element is a symbol indicating the direction
75 property of corresponding character. */
76Lisp_Object Vchar_direction_table;
77
8973478b 78/* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
0168c3d8 79unsigned char *_fetch_multibyte_char_p;
0168c3d8 80
c57f3328
KH
81/* Char table of scripts. */
82Lisp_Object Vchar_script_table;
83
c7e14352
KH
84/* Alist of scripts vs representative characters. */
85Lisp_Object Vscript_representative_chars;
86
c57f3328
KH
87static Lisp_Object Qchar_script_table;
88
a3cbb631 89Lisp_Object Vunicode_category_table;
0168c3d8
KH
90\f
91
2bde7652
KH
92/* If character code C has modifier masks, reflect them to the
93 character code if possible. Return the resulting code. */
94
95int
971de7fb 96char_resolve_modifier_mask (int c)
2bde7652 97{
d0363d44 98 /* A non-ASCII character can't reflect modifier bits to the code. */
2bde7652
KH
99 if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
100 return c;
101
102 /* For Meta, Shift, and Control modifiers, we need special care. */
2bde7652
KH
103 if (c & CHAR_SHIFT)
104 {
105 /* Shift modifier is valid only with [A-Za-z]. */
106 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
107 c &= ~CHAR_SHIFT;
108 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
109 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
03365d0e
KH
110 /* Shift modifier for control characters and SPC is ignored. */
111 else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
d0363d44
KH
112 c &= ~CHAR_SHIFT;
113 }
2bde7652
KH
114 if (c & CHAR_CTL)
115 {
116 /* Simulate the code in lread.c. */
117 /* Allow `\C- ' and `\C-?'. */
03365d0e
KH
118 if ((c & 0377) == ' ')
119 c &= ~0177 & ~ CHAR_CTL;
120 else if ((c & 0377) == '?')
121 c = 0177 | (c & ~0177 & ~CHAR_CTL);
2bde7652
KH
122 /* ASCII control chars are made from letters (both cases),
123 as well as the non-letters within 0100...0137. */
124 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
125 c &= (037 | (~0177 & ~CHAR_CTL));
126 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
127 c &= (037 | (~0177 & ~CHAR_CTL));
128 }
e9c1637d 129#if 0 /* This is outside the scope of this function. (bug#4751) */
03365d0e
KH
130 if (c & CHAR_META)
131 {
132 /* Move the meta bit to the right place for a string. */
133 c = (c & ~CHAR_META) | 0x80;
134 }
e9c1637d 135#endif
2bde7652
KH
136
137 return c;
138}
139
140
33f91981
KH
141/* Store multibyte form of character C at P. If C has modifier bits,
142 handle them appropriately. */
143
0168c3d8 144int
971de7fb 145char_string (unsigned int c, unsigned char *p)
0168c3d8
KH
146{
147 int bytes;
148
e3d8eb8c
KH
149 if (c & CHAR_MODIFIER_MASK)
150 {
2bde7652 151 c = (unsigned) char_resolve_modifier_mask ((int) c);
e3d8eb8c
KH
152 /* If C still has any modifier bits, just ignore it. */
153 c &= ~CHAR_MODIFIER_MASK;
154 }
155
0168c3d8
KH
156 MAYBE_UNIFY_CHAR (c);
157
e3d8eb8c 158 if (c <= MAX_3_BYTE_CHAR)
0168c3d8
KH
159 {
160 bytes = CHAR_STRING (c, p);
161 }
162 else if (c <= MAX_4_BYTE_CHAR)
163 {
164 p[0] = (0xF0 | (c >> 18));
165 p[1] = (0x80 | ((c >> 12) & 0x3F));
166 p[2] = (0x80 | ((c >> 6) & 0x3F));
167 p[3] = (0x80 | (c & 0x3F));
168 bytes = 4;
169 }
e3d8eb8c 170 else if (c <= MAX_5_BYTE_CHAR)
0168c3d8
KH
171 {
172 p[0] = 0xF8;
173 p[1] = (0x80 | ((c >> 18) & 0x0F));
174 p[2] = (0x80 | ((c >> 12) & 0x3F));
175 p[3] = (0x80 | ((c >> 6) & 0x3F));
176 p[4] = (0x80 | (c & 0x3F));
177 bytes = 5;
178 }
5aa91c9b 179 else if (c <= MAX_CHAR)
e3d8eb8c
KH
180 {
181 c = CHAR_TO_BYTE8 (c);
182 bytes = BYTE8_STRING (c, p);
183 }
5aa91c9b
KH
184 else
185 error ("Invalid character: %d", c);
1889b238 186
0168c3d8
KH
187 return bytes;
188}
189
190
33f91981
KH
191/* Return a character whose multibyte form is at P. Set LEN is not
192 NULL, it must be a pointer to integer. In that case, set *LEN to
193 the byte length of the multibyte form. If ADVANCED is not NULL, is
194 must be a pointer to unsigned char. In that case, set *ADVANCED to
195 the ending address (i.e. the starting address of the next
196 character) of the multibyte form. */
197
0168c3d8 198int
971de7fb 199string_char (const unsigned char *p, const unsigned char **advanced, int *len)
0168c3d8 200{
1889b238 201 int c;
15843e6f 202 const unsigned char *saved_p = p;
0168c3d8
KH
203
204 if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
205 {
206 c = STRING_CHAR_ADVANCE (p);
207 }
208 else if (! (*p & 0x08))
209 {
210 c = ((((p)[0] & 0xF) << 18)
211 | (((p)[1] & 0x3F) << 12)
212 | (((p)[2] & 0x3F) << 6)
213 | ((p)[3] & 0x3F));
214 p += 4;
215 }
216 else
217 {
218 c = ((((p)[1] & 0x3F) << 18)
219 | (((p)[2] & 0x3F) << 12)
220 | (((p)[3] & 0x3F) << 6)
221 | ((p)[4] & 0x3F));
222 p += 5;
223 }
224
225 MAYBE_UNIFY_CHAR (c);
226
227 if (len)
228 *len = p - saved_p;
229 if (advanced)
230 *advanced = p;
231 return c;
232}
233
234
235/* Translate character C by translation table TABLE. If C is
236 negative, translate a character specified by CHARSET and CODE. If
237 no translation is found in TABLE, return the untranslated
10453be9
KH
238 character. If TABLE is a list, elements are char tables. In this
239 case, translace C by all tables. */
0168c3d8
KH
240
241int
971de7fb 242translate_char (Lisp_Object table, int c)
0168c3d8 243{
10453be9
KH
244 if (CHAR_TABLE_P (table))
245 {
246 Lisp_Object ch;
247
248 ch = CHAR_TABLE_REF (table, c);
249 if (CHARACTERP (ch))
250 c = XINT (ch);
251 }
252 else
253 {
254 for (; CONSP (table); table = XCDR (table))
255 c = translate_char (XCAR (table), c);
256 }
257 return c;
0168c3d8
KH
258}
259
2e5db15c
KH
260/* Convert ASCII or 8-bit character C to unibyte. If C is none of
261 them, return (C & 0xFF).
0168c3d8
KH
262
263 The argument REV_TBL is now ignored. It will be removed in the
264 future. */
265
266int
971de7fb 267multibyte_char_to_unibyte (int c, Lisp_Object rev_tbl)
0168c3d8 268{
2e5db15c
KH
269 if (c < 0x80)
270 return c;
b672c5ae
KH
271 if (CHAR_BYTE8_P (c))
272 return CHAR_TO_BYTE8 (c);
2e5db15c 273 return (c & 0xFF);
0168c3d8
KH
274}
275
935d5b02
KH
276/* Like multibyte_char_to_unibyte, but return -1 if C is not supported
277 by charset_unibyte. */
278
279int
971de7fb 280multibyte_char_to_unibyte_safe (int c)
935d5b02 281{
2e5db15c
KH
282 if (c < 0x80)
283 return c;
935d5b02
KH
284 if (CHAR_BYTE8_P (c))
285 return CHAR_TO_BYTE8 (c);
2e5db15c 286 return -1;
935d5b02 287}
0168c3d8
KH
288
289DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
290 doc: /* Return non-nil if OBJECT is a character. */)
5842a27b 291 (Lisp_Object object, Lisp_Object ignore)
0168c3d8
KH
292{
293 return (CHARACTERP (object) ? Qt : Qnil);
294}
295
296DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
297 doc: /* Return the character of the maximum code. */)
5842a27b 298 (void)
0168c3d8
KH
299{
300 return make_number (MAX_CHAR);
301}
302
303DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
304 Sunibyte_char_to_multibyte, 1, 1, 0,
5556875b 305 doc: /* Convert the byte CH to multibyte character. */)
5842a27b 306 (Lisp_Object ch)
0168c3d8
KH
307{
308 int c;
0168c3d8
KH
309
310 CHECK_CHARACTER (ch);
311 c = XFASTINT (ch);
2e5db15c
KH
312 if (c >= 0x100)
313 error ("Not a unibyte character: %d", c);
4c0354d7 314 MAKE_CHAR_MULTIBYTE (c);
0168c3d8
KH
315 return make_number (c);
316}
317
318DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
319 Smultibyte_char_to_unibyte, 1, 1, 0,
5556875b
SM
320 doc: /* Convert the multibyte character CH to a byte.
321If the multibyte character does not represent a byte, return -1. */)
5842a27b 322 (Lisp_Object ch)
0168c3d8 323{
5556875b 324 int cm;
0168c3d8
KH
325
326 CHECK_CHARACTER (ch);
5556875b
SM
327 cm = XFASTINT (ch);
328 if (cm < 256)
329 /* Can't distinguish a byte read from a unibyte buffer from
330 a latin1 char, so let's let it slide. */
331 return ch;
332 else
333 {
2afc21f5 334 int cu = CHAR_TO_BYTE_SAFE (cm);
5556875b
SM
335 return make_number (cu);
336 }
0168c3d8
KH
337}
338
339DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
340 doc: /* Return 1 regardless of the argument CHAR.
add553ac
JB
341This is now an obsolete function. We keep it just for backward compatibility.
342usage: (char-bytes CHAR) */)
5842a27b 343 (Lisp_Object ch)
0168c3d8
KH
344{
345 CHECK_CHARACTER (ch);
346 return make_number (1);
347}
348
349DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
350 doc: /* Return width of CHAR when displayed in the current buffer.
351The width is measured by how many columns it occupies on the screen.
add553ac
JB
352Tab is taken to occupy `tab-width' columns.
353usage: (char-width CHAR) */)
5842a27b 354 (Lisp_Object ch)
0168c3d8
KH
355{
356 Lisp_Object disp;
357 int c, width;
358 struct Lisp_Char_Table *dp = buffer_display_table ();
359
360 CHECK_CHARACTER (ch);
361 c = XINT (ch);
362
363 /* Get the way the display table would display it. */
364 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
365
366 if (VECTORP (disp))
367 width = ASIZE (disp);
368 else
369 width = CHAR_WIDTH (c);
370
371 return make_number (width);
372}
373
0168c3d8
KH
374/* Return width of string STR of length LEN when displayed in the
375 current buffer. The width is measured by how many columns it
376 occupies on the screen. If PRECISION > 0, return the width of
377 longest substring that doesn't exceed PRECISION, and set number of
378 characters and bytes of the substring in *NCHARS and *NBYTES
379 respectively. */
380
579c18d0
EZ
381EMACS_INT
382c_string_width (const unsigned char *str, EMACS_INT len, int precision,
383 EMACS_INT *nchars, EMACS_INT *nbytes)
0168c3d8 384{
579c18d0
EZ
385 EMACS_INT i = 0, i_byte = 0;
386 EMACS_INT width = 0;
0168c3d8
KH
387 struct Lisp_Char_Table *dp = buffer_display_table ();
388
389 while (i_byte < len)
390 {
391 int bytes, thiswidth;
392 Lisp_Object val;
62a6e103 393 int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
0168c3d8
KH
394
395 if (dp)
396 {
397 val = DISP_CHAR_VECTOR (dp, c);
398 if (VECTORP (val))
399 thiswidth = XVECTOR (val)->size;
400 else
401 thiswidth = CHAR_WIDTH (c);
402 }
403 else
404 {
405 thiswidth = CHAR_WIDTH (c);
406 }
407
408 if (precision > 0
409 && (width + thiswidth > precision))
410 {
411 *nchars = i;
412 *nbytes = i_byte;
413 return width;
414 }
415 i++;
416 i_byte += bytes;
417 width += thiswidth;
418 }
419
420 if (precision > 0)
421 {
422 *nchars = i;
423 *nbytes = i_byte;
424 }
425
426 return width;
427}
428
1889b238
KH
429/* Return width of string STR of length LEN when displayed in the
430 current buffer. The width is measured by how many columns it
431 occupies on the screen. */
432
579c18d0
EZ
433EMACS_INT
434strwidth (const unsigned char *str, EMACS_INT len)
1889b238
KH
435{
436 return c_string_width (str, len, -1, NULL, NULL);
437}
438
0168c3d8
KH
439/* Return width of Lisp string STRING when displayed in the current
440 buffer. The width is measured by how many columns it occupies on
441 the screen while paying attention to compositions. If PRECISION >
442 0, return the width of longest substring that doesn't exceed
443 PRECISION, and set number of characters and bytes of the substring
444 in *NCHARS and *NBYTES respectively. */
445
579c18d0
EZ
446EMACS_INT
447lisp_string_width (Lisp_Object string, int precision,
448 EMACS_INT *nchars, EMACS_INT *nbytes)
0168c3d8 449{
579c18d0 450 EMACS_INT len = SCHARS (string);
0aee65b9
KH
451 /* This set multibyte to 0 even if STRING is multibyte when it
452 contains only ascii and eight-bit-graphic, but that's
453 intentional. */
454 int multibyte = len < SBYTES (string);
8f924df7 455 unsigned char *str = SDATA (string);
579c18d0
EZ
456 EMACS_INT i = 0, i_byte = 0;
457 EMACS_INT width = 0;
0168c3d8
KH
458 struct Lisp_Char_Table *dp = buffer_display_table ();
459
460 while (i < len)
461 {
462 int chars, bytes, thiswidth;
463 Lisp_Object val;
464 int cmp_id;
f4bc0685 465 EMACS_INT ignore, end;
0168c3d8
KH
466
467 if (find_composition (i, -1, &ignore, &end, &val, string)
468 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
469 >= 0))
470 {
471 thiswidth = composition_table[cmp_id]->width;
472 chars = end - i;
473 bytes = string_char_to_byte (string, end) - i_byte;
474 }
0168c3d8
KH
475 else
476 {
0aee65b9 477 int c;
0168c3d8 478
0aee65b9 479 if (multibyte)
62a6e103 480 c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
0aee65b9
KH
481 else
482 c = str[i_byte], bytes = 1;
0168c3d8 483 chars = 1;
0aee65b9
KH
484 if (dp)
485 {
486 val = DISP_CHAR_VECTOR (dp, c);
487 if (VECTORP (val))
488 thiswidth = XVECTOR (val)->size;
489 else
490 thiswidth = CHAR_WIDTH (c);
491 }
492 else
493 {
494 thiswidth = CHAR_WIDTH (c);
495 }
0168c3d8
KH
496 }
497
498 if (precision > 0
499 && (width + thiswidth > precision))
500 {
501 *nchars = i;
502 *nbytes = i_byte;
503 return width;
504 }
505 i += chars;
506 i_byte += bytes;
507 width += thiswidth;
508 }
509
510 if (precision > 0)
511 {
512 *nchars = i;
513 *nbytes = i_byte;
514 }
515
516 return width;
517}
518
519DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
520 doc: /* Return width of STRING when displayed in the current buffer.
521Width is measured by how many columns it occupies on the screen.
522When calculating width of a multibyte character in STRING,
523only the base leading-code is considered; the validity of
524the following bytes is not checked. Tabs in STRING are always
add553ac
JB
525taken to occupy `tab-width' columns.
526usage: (string-width STRING) */)
5842a27b 527 (Lisp_Object str)
0168c3d8
KH
528{
529 Lisp_Object val;
530
531 CHECK_STRING (str);
532 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
533 return val;
534}
535
536DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
537 doc: /* Return the direction of CHAR.
add553ac
JB
538The returned value is 0 for left-to-right and 1 for right-to-left.
539usage: (char-direction CHAR) */)
5842a27b 540 (Lisp_Object ch)
0168c3d8
KH
541{
542 int c;
543
544 CHECK_CHARACTER (ch);
545 c = XINT (ch);
546 return CHAR_TABLE_REF (Vchar_direction_table, c);
547}
548
0168c3d8
KH
549/* Return the number of characters in the NBYTES bytes at PTR.
550 This works by looking at the contents and checking for multibyte
551 sequences while assuming that there's no invalid sequence.
552 However, if the current buffer has enable-multibyte-characters =
553 nil, we treat each byte as a character. */
554
13818c30 555EMACS_INT
971de7fb 556chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
0168c3d8
KH
557{
558 /* current_buffer is null at early stages of Emacs initialization. */
559 if (current_buffer == 0
560 || NILP (current_buffer->enable_multibyte_characters))
561 return nbytes;
562
563 return multibyte_chars_in_text (ptr, nbytes);
564}
565
566/* Return the number of characters in the NBYTES bytes at PTR.
567 This works by looking at the contents and checking for multibyte
568 sequences while assuming that there's no invalid sequence. It
569 ignores enable-multibyte-characters. */
570
13818c30 571EMACS_INT
971de7fb 572multibyte_chars_in_text (const unsigned char *ptr, EMACS_INT nbytes)
0168c3d8 573{
8f924df7 574 const unsigned char *endp = ptr + nbytes;
579c18d0 575 EMACS_INT chars = 0;
0168c3d8
KH
576
577 while (ptr < endp)
578 {
579c18d0 579 EMACS_INT len = MULTIBYTE_LENGTH (ptr, endp);
0168c3d8
KH
580
581 if (len == 0)
582 abort ();
583 ptr += len;
584 chars++;
585 }
586
587 return chars;
588}
589
590/* Parse unibyte text at STR of LEN bytes as a multibyte text, count
591 characters and bytes in it, and store them in *NCHARS and *NBYTES
592 respectively. On counting bytes, pay attention to that 8-bit
593 characters not constructing a valid multibyte sequence are
594 represented by 2-byte in a multibyte text. */
595
596void
14162469
EZ
597parse_str_as_multibyte (const unsigned char *str, EMACS_INT len,
598 EMACS_INT *nchars, EMACS_INT *nbytes)
0168c3d8 599{
8f924df7 600 const unsigned char *endp = str + len;
14162469 601 EMACS_INT n, chars = 0, bytes = 0;
0168c3d8
KH
602
603 if (len >= MAX_MULTIBYTE_LENGTH)
604 {
8f924df7 605 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
0168c3d8
KH
606 while (str < adjusted_endp)
607 {
fc9a17bc
KH
608 if (! CHAR_BYTE8_HEAD_P (*str)
609 && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
0168c3d8
KH
610 str += n, bytes += n;
611 else
612 str++, bytes += 2;
613 chars++;
614 }
615 }
616 while (str < endp)
617 {
fc9a17bc
KH
618 if (! CHAR_BYTE8_HEAD_P (*str)
619 && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
0168c3d8
KH
620 str += n, bytes += n;
621 else
622 str++, bytes += 2;
623 chars++;
624 }
625
626 *nchars = chars;
627 *nbytes = bytes;
628 return;
629}
630
631/* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
632 It actually converts only such 8-bit characters that don't contruct
633 a multibyte sequence to multibyte forms of Latin-1 characters. If
634 NCHARS is nonzero, set *NCHARS to the number of characters in the
635 text. It is assured that we can use LEN bytes at STR as a work
636 area and that is enough. Return the number of bytes of the
637 resulting text. */
638
14162469
EZ
639EMACS_INT
640str_as_multibyte (unsigned char *str, EMACS_INT len, EMACS_INT nbytes,
641 EMACS_INT *nchars)
0168c3d8
KH
642{
643 unsigned char *p = str, *endp = str + nbytes;
644 unsigned char *to;
14162469 645 EMACS_INT chars = 0;
0168c3d8
KH
646 int n;
647
648 if (nbytes >= MAX_MULTIBYTE_LENGTH)
649 {
650 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
651 while (p < adjusted_endp
fc9a17bc 652 && ! CHAR_BYTE8_HEAD_P (*p)
0168c3d8
KH
653 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
654 p += n, chars++;
655 }
fc9a17bc
KH
656 while (p < endp
657 && ! CHAR_BYTE8_HEAD_P (*p)
658 && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
0168c3d8
KH
659 p += n, chars++;
660 if (nchars)
661 *nchars = chars;
662 if (p == endp)
663 return nbytes;
664
665 to = p;
666 nbytes = endp - p;
667 endp = str + len;
72af86bd 668 memmove (endp - nbytes, p, nbytes);
0168c3d8
KH
669 p = endp - nbytes;
670
671 if (nbytes >= MAX_MULTIBYTE_LENGTH)
672 {
673 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
674 while (p < adjusted_endp)
675 {
fc9a17bc
KH
676 if (! CHAR_BYTE8_HEAD_P (*p)
677 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
0168c3d8
KH
678 {
679 while (n--)
680 *to++ = *p++;
681 }
682 else
683 {
684 int c = *p++;
685 c = BYTE8_TO_CHAR (c);
686 to += CHAR_STRING (c, to);
687 }
688 }
689 chars++;
690 }
691 while (p < endp)
692 {
fc9a17bc
KH
693 if (! CHAR_BYTE8_HEAD_P (*p)
694 && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
0168c3d8
KH
695 {
696 while (n--)
697 *to++ = *p++;
8f924df7 698 }
0168c3d8
KH
699 else
700 {
701 int c = *p++;
702 c = BYTE8_TO_CHAR (c);
703 to += CHAR_STRING (c, to);
704 }
705 chars++;
706 }
707 if (nchars)
708 *nchars = chars;
709 return (to - str);
710}
711
712/* Parse unibyte string at STR of LEN bytes, and return the number of
713 bytes it may ocupy when converted to multibyte string by
714 `str_to_multibyte'. */
715
14162469
EZ
716EMACS_INT
717parse_str_to_multibyte (const unsigned char *str, EMACS_INT len)
0168c3d8 718{
eec47d6b 719 const unsigned char *endp = str + len;
14162469 720 EMACS_INT bytes;
0168c3d8
KH
721
722 for (bytes = 0; str < endp; str++)
723 bytes += (*str < 0x80) ? 1 : 2;
724 return bytes;
725}
726
727
728/* Convert unibyte text at STR of NBYTES bytes to a multibyte text
729 that contains the same single-byte characters. It actually
730 converts all 8-bit characters to multibyte forms. It is assured
731 that we can use LEN bytes at STR as a work area and that is
732 enough. */
733
14162469
EZ
734EMACS_INT
735str_to_multibyte (unsigned char *str, EMACS_INT len, EMACS_INT bytes)
0168c3d8
KH
736{
737 unsigned char *p = str, *endp = str + bytes;
738 unsigned char *to;
739
740 while (p < endp && *p < 0x80) p++;
741 if (p == endp)
742 return bytes;
743 to = p;
744 bytes = endp - p;
745 endp = str + len;
72af86bd 746 memmove (endp - bytes, p, bytes);
0168c3d8 747 p = endp - bytes;
8f924df7 748 while (p < endp)
0168c3d8
KH
749 {
750 int c = *p++;
751
752 if (c >= 0x80)
753 c = BYTE8_TO_CHAR (c);
754 to += CHAR_STRING (c, to);
755 }
756 return (to - str);
757}
758
759/* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
760 actually converts characters in the range 0x80..0xFF to
761 unibyte. */
762
14162469
EZ
763EMACS_INT
764str_as_unibyte (unsigned char *str, EMACS_INT bytes)
0168c3d8 765{
15843e6f
KH
766 const unsigned char *p = str, *endp = str + bytes;
767 unsigned char *to;
0168c3d8
KH
768 int c, len;
769
770 while (p < endp)
771 {
772 c = *p;
773 len = BYTES_BY_CHAR_HEAD (c);
774 if (CHAR_BYTE8_HEAD_P (c))
775 break;
776 p += len;
777 }
15843e6f 778 to = str + (p - str);
8f924df7 779 while (p < endp)
0168c3d8
KH
780 {
781 c = *p;
782 len = BYTES_BY_CHAR_HEAD (c);
783 if (CHAR_BYTE8_HEAD_P (c))
784 {
785 c = STRING_CHAR_ADVANCE (p);
786 *to++ = CHAR_TO_BYTE8 (c);
787 }
788 else
789 {
790 while (len--) *to++ = *p++;
791 }
792 }
793 return (to - str);
794}
795
4aa40bb8
KH
796/* Convert eight-bit chars in SRC (in multibyte form) to the
797 corresponding byte and store in DST. CHARS is the number of
798 characters in SRC. The value is the number of bytes stored in DST.
799 Usually, the value is the same as CHARS, but is less than it if SRC
800 contains a non-ASCII, non-eight-bit characater. If ACCEPT_LATIN_1
801 is nonzero, a Latin-1 character is accepted and converted to a byte
f27f70ec
KH
802 of that character code.
803 Note: Currently the arg ACCEPT_LATIN_1 is not used. */
4aa40bb8
KH
804
805EMACS_INT
971de7fb 806str_to_unibyte (const unsigned char *src, unsigned char *dst, EMACS_INT chars, int accept_latin_1)
4aa40bb8
KH
807{
808 EMACS_INT i;
809
810 for (i = 0; i < chars; i++)
811 {
812 int c = STRING_CHAR_ADVANCE (src);
813
814 if (CHAR_BYTE8_P (c))
815 c = CHAR_TO_BYTE8 (c);
816 else if (! ASCII_CHAR_P (c)
817 && (! accept_latin_1 || c >= 0x100))
818 return i;
819 *dst++ = c;
820 }
821 return i;
822}
823
824
14162469 825EMACS_INT
971de7fb 826string_count_byte8 (Lisp_Object string)
0168c3d8
KH
827{
828 int multibyte = STRING_MULTIBYTE (string);
14162469 829 EMACS_INT nbytes = SBYTES (string);
8f924df7 830 unsigned char *p = SDATA (string);
0168c3d8 831 unsigned char *pend = p + nbytes;
14162469 832 EMACS_INT count = 0;
0168c3d8
KH
833 int c, len;
834
835 if (multibyte)
836 while (p < pend)
837 {
838 c = *p;
839 len = BYTES_BY_CHAR_HEAD (c);
840
841 if (CHAR_BYTE8_HEAD_P (c))
842 count++;
843 p += len;
844 }
845 else
846 while (p < pend)
847 {
848 if (*p++ >= 0x80)
849 count++;
850 }
851 return count;
852}
853
854
855Lisp_Object
971de7fb 856string_escape_byte8 (Lisp_Object string)
0168c3d8 857{
14162469
EZ
858 EMACS_INT nchars = SCHARS (string);
859 EMACS_INT nbytes = SBYTES (string);
0168c3d8 860 int multibyte = STRING_MULTIBYTE (string);
14162469 861 EMACS_INT byte8_count;
15843e6f
KH
862 const unsigned char *src, *src_end;
863 unsigned char *dst;
0168c3d8
KH
864 Lisp_Object val;
865 int c, len;
866
867 if (multibyte && nchars == nbytes)
868 return string;
869
870 byte8_count = string_count_byte8 (string);
871
872 if (byte8_count == 0)
873 return string;
874
875 if (multibyte)
14162469
EZ
876 {
877 if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
878 || (MOST_POSITIVE_FIXNUM - nbytes) / 2 < byte8_count)
879 error ("Maximum string size exceeded");
880
881 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
882 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
883 nbytes + byte8_count * 2);
884 }
0168c3d8 885 else
14162469
EZ
886 {
887 if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count)
888 error ("Maximum string size exceeded");
889 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
890 val = make_uninit_string (nbytes + byte8_count * 3);
891 }
0168c3d8 892
8f924df7 893 src = SDATA (string);
0168c3d8 894 src_end = src + nbytes;
8f924df7 895 dst = SDATA (val);
0168c3d8
KH
896 if (multibyte)
897 while (src < src_end)
898 {
899 c = *src;
900 len = BYTES_BY_CHAR_HEAD (c);
901
902 if (CHAR_BYTE8_HEAD_P (c))
903 {
904 c = STRING_CHAR_ADVANCE (src);
905 c = CHAR_TO_BYTE8 (c);
1889b238 906 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
907 dst += 4;
908 }
909 else
910 while (len--) *dst++ = *src++;
911 }
912 else
913 while (src < src_end)
914 {
915 c = *src++;
916 if (c >= 0x80)
917 {
1889b238 918 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
919 dst += 4;
920 }
921 else
922 *dst++ = c;
923 }
924 return val;
925}
926
927\f
8f924df7 928DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
0168c3d8 929 doc: /*
d2e83296
DL
930Concatenate all the argument characters and make the result a string.
931usage: (string &rest CHARACTERS) */)
5842a27b 932 (int n, Lisp_Object *args)
0168c3d8 933{
754790b6
CY
934 int i, c;
935 unsigned char *buf, *p;
936 Lisp_Object str;
937 USE_SAFE_ALLOCA;
938
939 SAFE_ALLOCA (buf, unsigned char *, MAX_MULTIBYTE_LENGTH * n);
940 p = buf;
0168c3d8
KH
941
942 for (i = 0; i < n; i++)
943 {
944 CHECK_CHARACTER (args[i]);
945 c = XINT (args[i]);
946 p += CHAR_STRING (c, p);
947 }
948
754790b6
CY
949 str = make_string_from_bytes ((char *) buf, n, p - buf);
950 SAFE_FREE ();
951 return str;
0168c3d8
KH
952}
953
70b4969d 954DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
87d6f965
KH
955 doc: /* Concatenate all the argument bytes and make the result a unibyte string.
956usage: (unibyte-string &rest BYTES) */)
5842a27b 957 (int n, Lisp_Object *args)
70b4969d 958{
754790b6
CY
959 int i, c;
960 unsigned char *buf, *p;
961 Lisp_Object str;
962 USE_SAFE_ALLOCA;
963
964 SAFE_ALLOCA (buf, unsigned char *, n);
965 p = buf;
70b4969d
KH
966
967 for (i = 0; i < n; i++)
968 {
969 CHECK_NATNUM (args[i]);
970 c = XFASTINT (args[i]);
971 if (c >= 256)
972 args_out_of_range_3 (args[i], make_number (0), make_number (255));
973 *p++ = c;
974 }
975
754790b6
CY
976 str = make_string_from_bytes ((char *) buf, n, p - buf);
977 SAFE_FREE ();
978 return str;
70b4969d
KH
979}
980
c73ae4ae 981DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
d0363d44
KH
982 Schar_resolve_modifiers, 1, 1, 0,
983 doc: /* Resolve modifiers in the character CHAR.
984The value is a character with modifiers resolved into the character
985code. Unresolved modifiers are kept in the value.
c73ae4ae 986usage: (char-resolve-modifiers CHAR) */)
5842a27b 987 (Lisp_Object character)
d0363d44
KH
988{
989 int c;
990
991 CHECK_NUMBER (character);
992 c = XINT (character);
993 return make_number (char_resolve_modifier_mask (c));
994}
995
ee107a89
KH
996DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
997 doc: /* Return a byte value of a character at point.
998Optional 1st arg POSITION, if non-nil, is a position of a character to get
999a byte value.
1000Optional 2nd arg STRING, if non-nil, is a string of which first
1001character is a target to get a byte value. In this case, POSITION, if
1002non-nil, is an index of a target character in the string.
1003
1004If the current buffer (or STRING) is multibyte, and the target
1005character is not ASCII nor 8-bit character, an error is signalled. */)
5842a27b 1006 (Lisp_Object position, Lisp_Object string)
ee107a89
KH
1007{
1008 int c;
1009 EMACS_INT pos;
1010 unsigned char *p;
1011
1012 if (NILP (string))
1013 {
1014 if (NILP (position))
1015 {
1016 p = PT_ADDR;
4d8e170e 1017 }
ee107a89
KH
1018 else
1019 {
1020 CHECK_NUMBER_COERCE_MARKER (position);
1021 if (XINT (position) < BEGV || XINT (position) >= ZV)
1022 args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
1023 pos = XFASTINT (position);
1024 p = CHAR_POS_ADDR (pos);
1025 }
d5998e03
KH
1026 if (NILP (current_buffer->enable_multibyte_characters))
1027 return make_number (*p);
ee107a89
KH
1028 }
1029 else
1030 {
1031 CHECK_STRING (string);
1032 if (NILP (position))
1033 {
1034 p = SDATA (string);
1035 }
1036 else
1037 {
1038 CHECK_NATNUM (position);
1039 if (XINT (position) >= SCHARS (string))
1040 args_out_of_range (string, position);
1041 pos = XFASTINT (position);
1042 p = SDATA (string) + string_char_to_byte (string, pos);
1043 }
d5998e03
KH
1044 if (! STRING_MULTIBYTE (string))
1045 return make_number (*p);
ee107a89 1046 }
62a6e103 1047 c = STRING_CHAR (p);
ee107a89
KH
1048 if (CHAR_BYTE8_P (c))
1049 c = CHAR_TO_BYTE8 (c);
1050 else if (! ASCII_CHAR_P (c))
1051 error ("Not an ASCII nor an 8-bit character: %d", c);
1052 return make_number (c);
1053}
1054
1055
0168c3d8 1056void
971de7fb 1057init_character_once (void)
0168c3d8
KH
1058{
1059}
1060
1061#ifdef emacs
1062
1063void
971de7fb 1064syms_of_character (void)
0168c3d8
KH
1065{
1066 DEFSYM (Qcharacterp, "characterp");
1067 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1068
1069 staticpro (&Vchar_unify_table);
1070 Vchar_unify_table = Qnil;
1071
1072 defsubr (&Smax_char);
1073 defsubr (&Scharacterp);
1074 defsubr (&Sunibyte_char_to_multibyte);
1075 defsubr (&Smultibyte_char_to_unibyte);
1076 defsubr (&Schar_bytes);
1077 defsubr (&Schar_width);
1078 defsubr (&Sstring_width);
1079 defsubr (&Schar_direction);
0168c3d8 1080 defsubr (&Sstring);
70b4969d 1081 defsubr (&Sunibyte_string);
d0363d44 1082 defsubr (&Schar_resolve_modifiers);
ee107a89 1083 defsubr (&Sget_byte);
0168c3d8
KH
1084
1085 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
1086 doc: /*
68978cf0
DL
1087Vector recording all translation tables ever defined.
1088Each element is a pair (SYMBOL . TABLE) relating the table to the
1089symbol naming it. The ID of a translation table is an index into this vector. */);
0168c3d8
KH
1090 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1091
1092 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1093 doc: /*
1094A char-table for characters which invoke auto-filling.
1095Such characters have value t in this table. */);
1096 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
6cc0e1ca
DL
1097 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1098 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
0168c3d8
KH
1099
1100 DEFVAR_LISP ("char-width-table", &Vchar_width_table,
1101 doc: /*
1102A char-table for width (columns) of each character. */);
1103 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
be8b50bc
KH
1104 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1105 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1106 make_number (4));
0168c3d8
KH
1107
1108 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
1109 doc: /* A char-table for direction of each character. */);
1110 Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
1111
1112 DEFVAR_LISP ("printable-chars", &Vprintable_chars,
1113 doc: /* A char-table for each printable character. */);
db6d4189 1114 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
67dde660
KH
1115 Fset_char_table_range (Vprintable_chars,
1116 Fcons (make_number (32), make_number (126)), Qt);
1117 Fset_char_table_range (Vprintable_chars,
1118 Fcons (make_number (160),
1119 make_number (MAX_5_BYTE_CHAR)), Qt);
15843e6f 1120
c57f3328
KH
1121 DEFVAR_LISP ("char-script-table", &Vchar_script_table,
1122 doc: /* Char table of script symbols.
1123It has one extra slot whose value is a list of script symbols. */);
1124
1125 /* Intern this now in case it isn't already done.
1126 Setting this variable twice is harmless.
1127 But don't staticpro it here--that is done in alloc.c. */
d67b4f80 1128 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
c57f3328
KH
1129 DEFSYM (Qchar_script_table, "char-script-table");
1130 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1131 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
c7e14352
KH
1132
1133 DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
c3bb7671 1134 doc: /* Alist of scripts vs the representative characters.
f4427a54 1135Each element is a cons (SCRIPT . CHARS).
4d8e170e 1136SCRIPT is a symbol representing a script or a subgroup of a script.
c3bb7671 1137CHARS is a list or a vector of characters.
472a4dc9 1138If it is a list, all characters in the list are necessary for supporting SCRIPT.
c3bb7671
KH
1139If it is a vector, one of the characters in the vector is necessary.
1140This variable is used to find a font for a specific script. */);
c7e14352 1141 Vscript_representative_chars = Qnil;
a3cbb631
KH
1142
1143 DEFVAR_LISP ("unicode-category-table", &Vunicode_category_table,
1144 doc: /* Char table of Unicode's "General Category".
472a4dc9
JB
1145All Unicode characters have one of the following values (symbol):
1146 Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
a3cbb631
KH
1147 Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1148See The Unicode Standard for the meaning of those values. */);
1149 /* The correct char-table is setup in characters.el. */
1150 Vunicode_category_table = Qnil;
0168c3d8
KH
1151}
1152
1153#endif /* emacs */
fbaf0946
MB
1154
1155/* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1156 (do not change this comment) */