Update for font-backend changes.
[bpt/emacs.git] / src / character.c
CommitLineData
0168c3d8
KH
1/* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
8f924df7 3 Licensed to the Free Software Foundation.
ec62e0ac
GM
4 Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 Free Software Foundation, Inc.
6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008
0168c3d8
KH
7 National Institute of Advanced Industrial Science and Technology (AIST)
8 Registration Number H13PRO009
9
10This file is part of GNU Emacs.
11
12GNU Emacs is free software; you can redistribute it and/or modify
13it under the terms of the GNU General Public License as published by
ec62e0ac 14the Free Software Foundation; either version 3, or (at your option)
0168c3d8
KH
15any later version.
16
17GNU Emacs is distributed in the hope that it will be useful,
18but WITHOUT ANY WARRANTY; without even the implied warranty of
19MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20GNU General Public License for more details.
21
22You should have received a copy of the GNU General Public License
ec62e0ac
GM
23along with GNU Emacs; see the file COPYING. If not, write to the
24Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
25Boston, MA 02110-1301, USA. */
0168c3d8
KH
26
27/* At first, see the document in `character.h' to understand the code
28 in this file. */
29
30#ifdef emacs
31#include <config.h>
32#endif
33
34#include <stdio.h>
35
36#ifdef emacs
37
38#include <sys/types.h>
39#include "lisp.h"
40#include "character.h"
41#include "buffer.h"
42#include "charset.h"
43#include "composite.h"
44#include "disptab.h"
45
46#else /* not emacs */
47
48#include "mulelib.h"
49
50#endif /* emacs */
51
52Lisp_Object Qcharacterp;
53
54/* Vector of translation table ever defined.
55 ID of a translation table is used to index this vector. */
56Lisp_Object Vtranslation_table_vector;
57
58/* A char-table for characters which may invoke auto-filling. */
59Lisp_Object Vauto_fill_chars;
60
61Lisp_Object Qauto_fill_chars;
62
33f91981
KH
63/* Char-table of information about which character to unify to which
64 Unicode character. */
0168c3d8
KH
65Lisp_Object Vchar_unify_table;
66
67/* A char-table. An element is non-nil iff the corresponding
68 character has a printable glyph. */
69Lisp_Object Vprintable_chars;
70
71/* A char-table. An elemnent is a column-width of the corresponding
72 character. */
73Lisp_Object Vchar_width_table;
74
75/* A char-table. An element is a symbol indicating the direction
76 property of corresponding character. */
77Lisp_Object Vchar_direction_table;
78
8973478b 79/* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
0168c3d8 80unsigned char *_fetch_multibyte_char_p;
0168c3d8 81
c57f3328
KH
82/* Char table of scripts. */
83Lisp_Object Vchar_script_table;
84
c7e14352
KH
85/* Alist of scripts vs representative characters. */
86Lisp_Object Vscript_representative_chars;
87
c57f3328
KH
88static Lisp_Object Qchar_script_table;
89
b672c5ae
KH
90/* Mapping table from unibyte chars to multibyte chars. */
91int unibyte_to_multibyte_table[256];
15843e6f 92
90c9d035
KH
93/* Nth element is 1 iff unibyte char N can be mapped to a multibyte
94 char. */
95char unibyte_has_multibyte_table[256];
96
0168c3d8
KH
97\f
98
2bde7652
KH
99/* If character code C has modifier masks, reflect them to the
100 character code if possible. Return the resulting code. */
101
102int
103char_resolve_modifier_mask (c)
104 int c;
105{
d0363d44 106 /* A non-ASCII character can't reflect modifier bits to the code. */
2bde7652
KH
107 if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
108 return c;
109
110 /* For Meta, Shift, and Control modifiers, we need special care. */
2bde7652
KH
111 if (c & CHAR_SHIFT)
112 {
113 /* Shift modifier is valid only with [A-Za-z]. */
114 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
115 c &= ~CHAR_SHIFT;
116 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
117 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
03365d0e
KH
118 /* Shift modifier for control characters and SPC is ignored. */
119 else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
d0363d44
KH
120 c &= ~CHAR_SHIFT;
121 }
2bde7652
KH
122 if (c & CHAR_CTL)
123 {
124 /* Simulate the code in lread.c. */
125 /* Allow `\C- ' and `\C-?'. */
03365d0e
KH
126 if ((c & 0377) == ' ')
127 c &= ~0177 & ~ CHAR_CTL;
128 else if ((c & 0377) == '?')
129 c = 0177 | (c & ~0177 & ~CHAR_CTL);
2bde7652
KH
130 /* ASCII control chars are made from letters (both cases),
131 as well as the non-letters within 0100...0137. */
132 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
133 c &= (037 | (~0177 & ~CHAR_CTL));
134 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
135 c &= (037 | (~0177 & ~CHAR_CTL));
136 }
03365d0e
KH
137 if (c & CHAR_META)
138 {
139 /* Move the meta bit to the right place for a string. */
140 c = (c & ~CHAR_META) | 0x80;
141 }
2bde7652
KH
142
143 return c;
144}
145
146
33f91981
KH
147/* Store multibyte form of character C at P. If C has modifier bits,
148 handle them appropriately. */
149
0168c3d8 150int
e3d8eb8c 151char_string (c, p)
5aa91c9b 152 unsigned c;
1889b238 153 unsigned char *p;
0168c3d8
KH
154{
155 int bytes;
156
e3d8eb8c
KH
157 if (c & CHAR_MODIFIER_MASK)
158 {
2bde7652 159 c = (unsigned) char_resolve_modifier_mask ((int) c);
e3d8eb8c
KH
160 /* If C still has any modifier bits, just ignore it. */
161 c &= ~CHAR_MODIFIER_MASK;
162 }
163
0168c3d8
KH
164 MAYBE_UNIFY_CHAR (c);
165
e3d8eb8c 166 if (c <= MAX_3_BYTE_CHAR)
0168c3d8
KH
167 {
168 bytes = CHAR_STRING (c, p);
169 }
170 else if (c <= MAX_4_BYTE_CHAR)
171 {
172 p[0] = (0xF0 | (c >> 18));
173 p[1] = (0x80 | ((c >> 12) & 0x3F));
174 p[2] = (0x80 | ((c >> 6) & 0x3F));
175 p[3] = (0x80 | (c & 0x3F));
176 bytes = 4;
177 }
e3d8eb8c 178 else if (c <= MAX_5_BYTE_CHAR)
0168c3d8
KH
179 {
180 p[0] = 0xF8;
181 p[1] = (0x80 | ((c >> 18) & 0x0F));
182 p[2] = (0x80 | ((c >> 12) & 0x3F));
183 p[3] = (0x80 | ((c >> 6) & 0x3F));
184 p[4] = (0x80 | (c & 0x3F));
185 bytes = 5;
186 }
5aa91c9b 187 else if (c <= MAX_CHAR)
e3d8eb8c
KH
188 {
189 c = CHAR_TO_BYTE8 (c);
190 bytes = BYTE8_STRING (c, p);
191 }
5aa91c9b
KH
192 else
193 error ("Invalid character: %d", c);
1889b238 194
0168c3d8
KH
195 return bytes;
196}
197
198
33f91981
KH
199/* Return a character whose multibyte form is at P. Set LEN is not
200 NULL, it must be a pointer to integer. In that case, set *LEN to
201 the byte length of the multibyte form. If ADVANCED is not NULL, is
202 must be a pointer to unsigned char. In that case, set *ADVANCED to
203 the ending address (i.e. the starting address of the next
204 character) of the multibyte form. */
205
0168c3d8 206int
e3d8eb8c 207string_char (p, advanced, len)
15843e6f
KH
208 const unsigned char *p;
209 const unsigned char **advanced;
0168c3d8
KH
210 int *len;
211{
1889b238 212 int c;
15843e6f 213 const unsigned char *saved_p = p;
0168c3d8
KH
214
215 if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
216 {
217 c = STRING_CHAR_ADVANCE (p);
218 }
219 else if (! (*p & 0x08))
220 {
221 c = ((((p)[0] & 0xF) << 18)
222 | (((p)[1] & 0x3F) << 12)
223 | (((p)[2] & 0x3F) << 6)
224 | ((p)[3] & 0x3F));
225 p += 4;
226 }
227 else
228 {
229 c = ((((p)[1] & 0x3F) << 18)
230 | (((p)[2] & 0x3F) << 12)
231 | (((p)[3] & 0x3F) << 6)
232 | ((p)[4] & 0x3F));
233 p += 5;
234 }
235
236 MAYBE_UNIFY_CHAR (c);
237
238 if (len)
239 *len = p - saved_p;
240 if (advanced)
241 *advanced = p;
242 return c;
243}
244
245
246/* Translate character C by translation table TABLE. If C is
247 negative, translate a character specified by CHARSET and CODE. If
248 no translation is found in TABLE, return the untranslated
10453be9
KH
249 character. If TABLE is a list, elements are char tables. In this
250 case, translace C by all tables. */
0168c3d8
KH
251
252int
253translate_char (table, c)
254 Lisp_Object table;
255 int c;
256{
10453be9
KH
257 if (CHAR_TABLE_P (table))
258 {
259 Lisp_Object ch;
260
261 ch = CHAR_TABLE_REF (table, c);
262 if (CHARACTERP (ch))
263 c = XINT (ch);
264 }
265 else
266 {
267 for (; CONSP (table); table = XCDR (table))
268 c = translate_char (XCAR (table), c);
269 }
270 return c;
0168c3d8
KH
271}
272
0168c3d8 273/* Convert the multibyte character C to unibyte 8-bit character based
ac86488b
KH
274 on the current value of charset_unibyte. If dimension of
275 charset_unibyte is more than one, return (C & 0xFF).
0168c3d8
KH
276
277 The argument REV_TBL is now ignored. It will be removed in the
278 future. */
279
280int
281multibyte_char_to_unibyte (c, rev_tbl)
282 int c;
283 Lisp_Object rev_tbl;
284{
b672c5ae
KH
285 struct charset *charset;
286 unsigned c1;
0168c3d8 287
b672c5ae
KH
288 if (CHAR_BYTE8_P (c))
289 return CHAR_TO_BYTE8 (c);
290 charset = CHARSET_FROM_ID (charset_unibyte);
291 c1 = ENCODE_CHAR (charset, c);
0168c3d8
KH
292 return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
293}
294
935d5b02
KH
295/* Like multibyte_char_to_unibyte, but return -1 if C is not supported
296 by charset_unibyte. */
297
298int
299multibyte_char_to_unibyte_safe (c)
300 int c;
301{
302 struct charset *charset;
303 unsigned c1;
304
305 if (CHAR_BYTE8_P (c))
306 return CHAR_TO_BYTE8 (c);
307 charset = CHARSET_FROM_ID (charset_unibyte);
308 c1 = ENCODE_CHAR (charset, c);
309 return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : -1);
310}
0168c3d8
KH
311
312DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
313 doc: /* Return non-nil if OBJECT is a character. */)
314 (object, ignore)
315 Lisp_Object object, ignore;
316{
317 return (CHARACTERP (object) ? Qt : Qnil);
318}
319
320DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
321 doc: /* Return the character of the maximum code. */)
322 ()
323{
324 return make_number (MAX_CHAR);
325}
326
327DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
328 Sunibyte_char_to_multibyte, 1, 1, 0,
5556875b 329 doc: /* Convert the byte CH to multibyte character. */)
0168c3d8
KH
330 (ch)
331 Lisp_Object ch;
332{
333 int c;
334 struct charset *charset;
335
336 CHECK_CHARACTER (ch);
337 c = XFASTINT (ch);
338 if (c >= 0400)
339 error ("Invalid unibyte character: %d", c);
ac86488b 340 charset = CHARSET_FROM_ID (charset_unibyte);
0168c3d8
KH
341 c = DECODE_CHAR (charset, c);
342 if (c < 0)
3c5a53bd 343 c = BYTE8_TO_CHAR (XFASTINT (ch));
0168c3d8
KH
344 return make_number (c);
345}
346
347DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
348 Smultibyte_char_to_unibyte, 1, 1, 0,
5556875b
SM
349 doc: /* Convert the multibyte character CH to a byte.
350If the multibyte character does not represent a byte, return -1. */)
0168c3d8
KH
351 (ch)
352 Lisp_Object ch;
353{
5556875b 354 int cm;
0168c3d8
KH
355
356 CHECK_CHARACTER (ch);
5556875b
SM
357 cm = XFASTINT (ch);
358 if (cm < 256)
359 /* Can't distinguish a byte read from a unibyte buffer from
360 a latin1 char, so let's let it slide. */
361 return ch;
362 else
363 {
364 int cu = CHAR_TO_BYTE8 (cm);
365 return make_number (cu);
366 }
0168c3d8
KH
367}
368
369DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
370 doc: /* Return 1 regardless of the argument CHAR.
add553ac
JB
371This is now an obsolete function. We keep it just for backward compatibility.
372usage: (char-bytes CHAR) */)
0168c3d8
KH
373 (ch)
374 Lisp_Object ch;
375{
376 CHECK_CHARACTER (ch);
377 return make_number (1);
378}
379
380DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
381 doc: /* Return width of CHAR when displayed in the current buffer.
382The width is measured by how many columns it occupies on the screen.
add553ac
JB
383Tab is taken to occupy `tab-width' columns.
384usage: (char-width CHAR) */)
0168c3d8
KH
385 (ch)
386 Lisp_Object ch;
387{
388 Lisp_Object disp;
389 int c, width;
390 struct Lisp_Char_Table *dp = buffer_display_table ();
391
392 CHECK_CHARACTER (ch);
393 c = XINT (ch);
394
395 /* Get the way the display table would display it. */
396 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
397
398 if (VECTORP (disp))
399 width = ASIZE (disp);
400 else
401 width = CHAR_WIDTH (c);
402
403 return make_number (width);
404}
405
0168c3d8
KH
406/* Return width of string STR of length LEN when displayed in the
407 current buffer. The width is measured by how many columns it
408 occupies on the screen. If PRECISION > 0, return the width of
409 longest substring that doesn't exceed PRECISION, and set number of
410 characters and bytes of the substring in *NCHARS and *NBYTES
411 respectively. */
412
1889b238 413int
0168c3d8 414c_string_width (str, len, precision, nchars, nbytes)
8f924df7 415 const unsigned char *str;
0168c3d8
KH
416 int precision, *nchars, *nbytes;
417{
418 int i = 0, i_byte = 0;
419 int width = 0;
420 struct Lisp_Char_Table *dp = buffer_display_table ();
421
422 while (i_byte < len)
423 {
424 int bytes, thiswidth;
425 Lisp_Object val;
426 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
427
428 if (dp)
429 {
430 val = DISP_CHAR_VECTOR (dp, c);
431 if (VECTORP (val))
432 thiswidth = XVECTOR (val)->size;
433 else
434 thiswidth = CHAR_WIDTH (c);
435 }
436 else
437 {
438 thiswidth = CHAR_WIDTH (c);
439 }
440
441 if (precision > 0
442 && (width + thiswidth > precision))
443 {
444 *nchars = i;
445 *nbytes = i_byte;
446 return width;
447 }
448 i++;
449 i_byte += bytes;
450 width += thiswidth;
451 }
452
453 if (precision > 0)
454 {
455 *nchars = i;
456 *nbytes = i_byte;
457 }
458
459 return width;
460}
461
1889b238
KH
462/* Return width of string STR of length LEN when displayed in the
463 current buffer. The width is measured by how many columns it
464 occupies on the screen. */
465
466int
467strwidth (str, len)
468 unsigned char *str;
469 int len;
470{
471 return c_string_width (str, len, -1, NULL, NULL);
472}
473
0168c3d8
KH
474/* Return width of Lisp string STRING when displayed in the current
475 buffer. The width is measured by how many columns it occupies on
476 the screen while paying attention to compositions. If PRECISION >
477 0, return the width of longest substring that doesn't exceed
478 PRECISION, and set number of characters and bytes of the substring
479 in *NCHARS and *NBYTES respectively. */
480
481int
482lisp_string_width (string, precision, nchars, nbytes)
483 Lisp_Object string;
484 int precision, *nchars, *nbytes;
485{
8f924df7 486 int len = SCHARS (string);
0aee65b9
KH
487 /* This set multibyte to 0 even if STRING is multibyte when it
488 contains only ascii and eight-bit-graphic, but that's
489 intentional. */
490 int multibyte = len < SBYTES (string);
8f924df7 491 unsigned char *str = SDATA (string);
0168c3d8
KH
492 int i = 0, i_byte = 0;
493 int width = 0;
494 struct Lisp_Char_Table *dp = buffer_display_table ();
495
496 while (i < len)
497 {
498 int chars, bytes, thiswidth;
499 Lisp_Object val;
500 int cmp_id;
f4bc0685 501 EMACS_INT ignore, end;
0168c3d8
KH
502
503 if (find_composition (i, -1, &ignore, &end, &val, string)
504 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
505 >= 0))
506 {
507 thiswidth = composition_table[cmp_id]->width;
508 chars = end - i;
509 bytes = string_char_to_byte (string, end) - i_byte;
510 }
0168c3d8
KH
511 else
512 {
0aee65b9 513 int c;
0168c3d8 514
0aee65b9
KH
515 if (multibyte)
516 c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
517 else
518 c = str[i_byte], bytes = 1;
0168c3d8 519 chars = 1;
0aee65b9
KH
520 if (dp)
521 {
522 val = DISP_CHAR_VECTOR (dp, c);
523 if (VECTORP (val))
524 thiswidth = XVECTOR (val)->size;
525 else
526 thiswidth = CHAR_WIDTH (c);
527 }
528 else
529 {
530 thiswidth = CHAR_WIDTH (c);
531 }
0168c3d8
KH
532 }
533
534 if (precision > 0
535 && (width + thiswidth > precision))
536 {
537 *nchars = i;
538 *nbytes = i_byte;
539 return width;
540 }
541 i += chars;
542 i_byte += bytes;
543 width += thiswidth;
544 }
545
546 if (precision > 0)
547 {
548 *nchars = i;
549 *nbytes = i_byte;
550 }
551
552 return width;
553}
554
555DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
556 doc: /* Return width of STRING when displayed in the current buffer.
557Width is measured by how many columns it occupies on the screen.
558When calculating width of a multibyte character in STRING,
559only the base leading-code is considered; the validity of
560the following bytes is not checked. Tabs in STRING are always
add553ac
JB
561taken to occupy `tab-width' columns.
562usage: (string-width STRING) */)
0168c3d8
KH
563 (str)
564 Lisp_Object str;
565{
566 Lisp_Object val;
567
568 CHECK_STRING (str);
569 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
570 return val;
571}
572
573DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
574 doc: /* Return the direction of CHAR.
add553ac
JB
575The returned value is 0 for left-to-right and 1 for right-to-left.
576usage: (char-direction CHAR) */)
0168c3d8
KH
577 (ch)
578 Lisp_Object ch;
579{
580 int c;
581
582 CHECK_CHARACTER (ch);
583 c = XINT (ch);
584 return CHAR_TABLE_REF (Vchar_direction_table, c);
585}
586
0168c3d8
KH
587/* Return the number of characters in the NBYTES bytes at PTR.
588 This works by looking at the contents and checking for multibyte
589 sequences while assuming that there's no invalid sequence.
590 However, if the current buffer has enable-multibyte-characters =
591 nil, we treat each byte as a character. */
592
13818c30 593EMACS_INT
0168c3d8 594chars_in_text (ptr, nbytes)
8f924df7 595 const unsigned char *ptr;
13818c30 596 EMACS_INT nbytes;
0168c3d8
KH
597{
598 /* current_buffer is null at early stages of Emacs initialization. */
599 if (current_buffer == 0
600 || NILP (current_buffer->enable_multibyte_characters))
601 return nbytes;
602
603 return multibyte_chars_in_text (ptr, nbytes);
604}
605
606/* Return the number of characters in the NBYTES bytes at PTR.
607 This works by looking at the contents and checking for multibyte
608 sequences while assuming that there's no invalid sequence. It
609 ignores enable-multibyte-characters. */
610
13818c30 611EMACS_INT
0168c3d8 612multibyte_chars_in_text (ptr, nbytes)
8f924df7 613 const unsigned char *ptr;
13818c30 614 EMACS_INT nbytes;
0168c3d8 615{
8f924df7 616 const unsigned char *endp = ptr + nbytes;
0168c3d8
KH
617 int chars = 0;
618
619 while (ptr < endp)
620 {
621 int len = MULTIBYTE_LENGTH (ptr, endp);
622
623 if (len == 0)
624 abort ();
625 ptr += len;
626 chars++;
627 }
628
629 return chars;
630}
631
632/* Parse unibyte text at STR of LEN bytes as a multibyte text, count
633 characters and bytes in it, and store them in *NCHARS and *NBYTES
634 respectively. On counting bytes, pay attention to that 8-bit
635 characters not constructing a valid multibyte sequence are
636 represented by 2-byte in a multibyte text. */
637
638void
639parse_str_as_multibyte (str, len, nchars, nbytes)
8f924df7 640 const unsigned char *str;
0168c3d8
KH
641 int len, *nchars, *nbytes;
642{
8f924df7 643 const unsigned char *endp = str + len;
0168c3d8
KH
644 int n, chars = 0, bytes = 0;
645
646 if (len >= MAX_MULTIBYTE_LENGTH)
647 {
8f924df7 648 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
0168c3d8
KH
649 while (str < adjusted_endp)
650 {
651 if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
652 str += n, bytes += n;
653 else
654 str++, bytes += 2;
655 chars++;
656 }
657 }
658 while (str < endp)
659 {
660 if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
661 str += n, bytes += n;
662 else
663 str++, bytes += 2;
664 chars++;
665 }
666
667 *nchars = chars;
668 *nbytes = bytes;
669 return;
670}
671
672/* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
673 It actually converts only such 8-bit characters that don't contruct
674 a multibyte sequence to multibyte forms of Latin-1 characters. If
675 NCHARS is nonzero, set *NCHARS to the number of characters in the
676 text. It is assured that we can use LEN bytes at STR as a work
677 area and that is enough. Return the number of bytes of the
678 resulting text. */
679
680int
681str_as_multibyte (str, len, nbytes, nchars)
682 unsigned char *str;
683 int len, nbytes, *nchars;
684{
685 unsigned char *p = str, *endp = str + nbytes;
686 unsigned char *to;
687 int chars = 0;
688 int n;
689
690 if (nbytes >= MAX_MULTIBYTE_LENGTH)
691 {
692 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
693 while (p < adjusted_endp
694 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
695 p += n, chars++;
696 }
697 while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
698 p += n, chars++;
699 if (nchars)
700 *nchars = chars;
701 if (p == endp)
702 return nbytes;
703
704 to = p;
705 nbytes = endp - p;
706 endp = str + len;
707 safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
708 p = endp - nbytes;
709
710 if (nbytes >= MAX_MULTIBYTE_LENGTH)
711 {
712 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
713 while (p < adjusted_endp)
714 {
715 if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
716 {
717 while (n--)
718 *to++ = *p++;
719 }
720 else
721 {
722 int c = *p++;
723 c = BYTE8_TO_CHAR (c);
724 to += CHAR_STRING (c, to);
725 }
726 }
727 chars++;
728 }
729 while (p < endp)
730 {
731 if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
732 {
733 while (n--)
734 *to++ = *p++;
8f924df7 735 }
0168c3d8
KH
736 else
737 {
738 int c = *p++;
739 c = BYTE8_TO_CHAR (c);
740 to += CHAR_STRING (c, to);
741 }
742 chars++;
743 }
744 if (nchars)
745 *nchars = chars;
746 return (to - str);
747}
748
749/* Parse unibyte string at STR of LEN bytes, and return the number of
750 bytes it may ocupy when converted to multibyte string by
751 `str_to_multibyte'. */
752
753int
754parse_str_to_multibyte (str, len)
755 unsigned char *str;
756 int len;
757{
758 unsigned char *endp = str + len;
759 int bytes;
760
761 for (bytes = 0; str < endp; str++)
762 bytes += (*str < 0x80) ? 1 : 2;
763 return bytes;
764}
765
766
767/* Convert unibyte text at STR of NBYTES bytes to a multibyte text
768 that contains the same single-byte characters. It actually
769 converts all 8-bit characters to multibyte forms. It is assured
770 that we can use LEN bytes at STR as a work area and that is
771 enough. */
772
773int
774str_to_multibyte (str, len, bytes)
775 unsigned char *str;
776 int len, bytes;
777{
778 unsigned char *p = str, *endp = str + bytes;
779 unsigned char *to;
780
781 while (p < endp && *p < 0x80) p++;
782 if (p == endp)
783 return bytes;
784 to = p;
785 bytes = endp - p;
786 endp = str + len;
787 safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
788 p = endp - bytes;
8f924df7 789 while (p < endp)
0168c3d8
KH
790 {
791 int c = *p++;
792
793 if (c >= 0x80)
794 c = BYTE8_TO_CHAR (c);
795 to += CHAR_STRING (c, to);
796 }
797 return (to - str);
798}
799
800/* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
801 actually converts characters in the range 0x80..0xFF to
802 unibyte. */
803
804int
805str_as_unibyte (str, bytes)
806 unsigned char *str;
807 int bytes;
808{
15843e6f
KH
809 const unsigned char *p = str, *endp = str + bytes;
810 unsigned char *to;
0168c3d8
KH
811 int c, len;
812
813 while (p < endp)
814 {
815 c = *p;
816 len = BYTES_BY_CHAR_HEAD (c);
817 if (CHAR_BYTE8_HEAD_P (c))
818 break;
819 p += len;
820 }
15843e6f 821 to = str + (p - str);
8f924df7 822 while (p < endp)
0168c3d8
KH
823 {
824 c = *p;
825 len = BYTES_BY_CHAR_HEAD (c);
826 if (CHAR_BYTE8_HEAD_P (c))
827 {
828 c = STRING_CHAR_ADVANCE (p);
829 *to++ = CHAR_TO_BYTE8 (c);
830 }
831 else
832 {
833 while (len--) *to++ = *p++;
834 }
835 }
836 return (to - str);
837}
838
839int
840string_count_byte8 (string)
841 Lisp_Object string;
842{
843 int multibyte = STRING_MULTIBYTE (string);
8f924df7
KH
844 int nbytes = SBYTES (string);
845 unsigned char *p = SDATA (string);
0168c3d8
KH
846 unsigned char *pend = p + nbytes;
847 int count = 0;
848 int c, len;
849
850 if (multibyte)
851 while (p < pend)
852 {
853 c = *p;
854 len = BYTES_BY_CHAR_HEAD (c);
855
856 if (CHAR_BYTE8_HEAD_P (c))
857 count++;
858 p += len;
859 }
860 else
861 while (p < pend)
862 {
863 if (*p++ >= 0x80)
864 count++;
865 }
866 return count;
867}
868
869
870Lisp_Object
871string_escape_byte8 (string)
872 Lisp_Object string;
873{
8f924df7
KH
874 int nchars = SCHARS (string);
875 int nbytes = SBYTES (string);
0168c3d8
KH
876 int multibyte = STRING_MULTIBYTE (string);
877 int byte8_count;
15843e6f
KH
878 const unsigned char *src, *src_end;
879 unsigned char *dst;
0168c3d8
KH
880 Lisp_Object val;
881 int c, len;
882
883 if (multibyte && nchars == nbytes)
884 return string;
885
886 byte8_count = string_count_byte8 (string);
887
888 if (byte8_count == 0)
889 return string;
890
891 if (multibyte)
892 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
7b40ebaf 893 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
0168c3d8
KH
894 nbytes + byte8_count * 2);
895 else
896 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
897 val = make_uninit_string (nbytes + byte8_count * 3);
898
8f924df7 899 src = SDATA (string);
0168c3d8 900 src_end = src + nbytes;
8f924df7 901 dst = SDATA (val);
0168c3d8
KH
902 if (multibyte)
903 while (src < src_end)
904 {
905 c = *src;
906 len = BYTES_BY_CHAR_HEAD (c);
907
908 if (CHAR_BYTE8_HEAD_P (c))
909 {
910 c = STRING_CHAR_ADVANCE (src);
911 c = CHAR_TO_BYTE8 (c);
1889b238 912 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
913 dst += 4;
914 }
915 else
916 while (len--) *dst++ = *src++;
917 }
918 else
919 while (src < src_end)
920 {
921 c = *src++;
922 if (c >= 0x80)
923 {
1889b238 924 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
925 dst += 4;
926 }
927 else
928 *dst++ = c;
929 }
930 return val;
931}
932
933\f
8f924df7 934DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
0168c3d8 935 doc: /*
d2e83296
DL
936Concatenate all the argument characters and make the result a string.
937usage: (string &rest CHARACTERS) */)
0168c3d8
KH
938 (n, args)
939 int n;
940 Lisp_Object *args;
941{
942 int i;
943 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
944 unsigned char *p = buf;
945 int c;
946
947 for (i = 0; i < n; i++)
948 {
949 CHECK_CHARACTER (args[i]);
950 c = XINT (args[i]);
951 p += CHAR_STRING (c, p);
952 }
953
954 return make_string_from_bytes ((char *) buf, n, p - buf);
955}
956
70b4969d 957DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
87d6f965
KH
958 doc: /* Concatenate all the argument bytes and make the result a unibyte string.
959usage: (unibyte-string &rest BYTES) */)
70b4969d
KH
960 (n, args)
961 int n;
962 Lisp_Object *args;
963{
964 int i;
965 unsigned char *buf = (unsigned char *) alloca (n);
966 unsigned char *p = buf;
967 unsigned c;
968
969 for (i = 0; i < n; i++)
970 {
971 CHECK_NATNUM (args[i]);
972 c = XFASTINT (args[i]);
973 if (c >= 256)
974 args_out_of_range_3 (args[i], make_number (0), make_number (255));
975 *p++ = c;
976 }
977
978 return make_string_from_bytes ((char *) buf, n, p - buf);
979}
980
d0363d44
KH
981DEFUN ("char-resolve-modifers", Fchar_resolve_modifiers,
982 Schar_resolve_modifiers, 1, 1, 0,
983 doc: /* Resolve modifiers in the character CHAR.
984The value is a character with modifiers resolved into the character
985code. Unresolved modifiers are kept in the value.
986usage: (char-resolve-modifers CHAR) */)
987 (character)
988 Lisp_Object character;
989{
990 int c;
991
992 CHECK_NUMBER (character);
993 c = XINT (character);
994 return make_number (char_resolve_modifier_mask (c));
995}
996
0168c3d8
KH
997void
998init_character_once ()
999{
1000}
1001
1002#ifdef emacs
1003
1004void
1005syms_of_character ()
1006{
1007 DEFSYM (Qcharacterp, "characterp");
1008 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1009
1010 staticpro (&Vchar_unify_table);
1011 Vchar_unify_table = Qnil;
1012
1013 defsubr (&Smax_char);
1014 defsubr (&Scharacterp);
1015 defsubr (&Sunibyte_char_to_multibyte);
1016 defsubr (&Smultibyte_char_to_unibyte);
1017 defsubr (&Schar_bytes);
1018 defsubr (&Schar_width);
1019 defsubr (&Sstring_width);
1020 defsubr (&Schar_direction);
0168c3d8 1021 defsubr (&Sstring);
70b4969d 1022 defsubr (&Sunibyte_string);
d0363d44 1023 defsubr (&Schar_resolve_modifiers);
0168c3d8
KH
1024
1025 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
1026 doc: /*
68978cf0
DL
1027Vector recording all translation tables ever defined.
1028Each element is a pair (SYMBOL . TABLE) relating the table to the
1029symbol naming it. The ID of a translation table is an index into this vector. */);
0168c3d8
KH
1030 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1031
1032 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1033 doc: /*
1034A char-table for characters which invoke auto-filling.
1035Such characters have value t in this table. */);
1036 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
6cc0e1ca
DL
1037 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1038 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
0168c3d8
KH
1039
1040 DEFVAR_LISP ("char-width-table", &Vchar_width_table,
1041 doc: /*
1042A char-table for width (columns) of each character. */);
1043 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
be8b50bc
KH
1044 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1045 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1046 make_number (4));
0168c3d8
KH
1047
1048 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
1049 doc: /* A char-table for direction of each character. */);
1050 Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
1051
1052 DEFVAR_LISP ("printable-chars", &Vprintable_chars,
1053 doc: /* A char-table for each printable character. */);
db6d4189 1054 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
67dde660
KH
1055 Fset_char_table_range (Vprintable_chars,
1056 Fcons (make_number (32), make_number (126)), Qt);
1057 Fset_char_table_range (Vprintable_chars,
1058 Fcons (make_number (160),
1059 make_number (MAX_5_BYTE_CHAR)), Qt);
15843e6f 1060
c57f3328
KH
1061 DEFVAR_LISP ("char-script-table", &Vchar_script_table,
1062 doc: /* Char table of script symbols.
1063It has one extra slot whose value is a list of script symbols. */);
1064
1065 /* Intern this now in case it isn't already done.
1066 Setting this variable twice is harmless.
1067 But don't staticpro it here--that is done in alloc.c. */
1068 Qchar_table_extra_slots = intern ("char-table-extra-slots");
1069 DEFSYM (Qchar_script_table, "char-script-table");
1070 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1071 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
c7e14352
KH
1072
1073 DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
1074 doc: /* Alist of scripts vs the representative characters. */);
1075 Vscript_representative_chars = Qnil;
0168c3d8
KH
1076}
1077
1078#endif /* emacs */
fbaf0946
MB
1079
1080/* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1081 (do not change this comment) */