(Vscript_representative_chars): New variable.
[bpt/emacs.git] / src / character.c
CommitLineData
0168c3d8
KH
1/* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
8f924df7 3 Licensed to the Free Software Foundation.
cca4e3b0 4 Copyright (C) 2001, 2005 Free Software Foundation, Inc.
8f924df7 5 Copyright (C) 2003
0168c3d8
KH
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9This file is part of GNU Emacs.
10
11GNU Emacs is free software; you can redistribute it and/or modify
12it under the terms of the GNU General Public License as published by
13the Free Software Foundation; either version 2, or (at your option)
14any later version.
15
16GNU Emacs is distributed in the hope that it will be useful,
17but WITHOUT ANY WARRANTY; without even the implied warranty of
18MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19GNU General Public License for more details.
20
21You should have received a copy of the GNU General Public License
22along with GNU Emacs; see the file COPYING. If not, write to
23the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24Boston, MA 02111-1307, USA. */
25
26/* At first, see the document in `character.h' to understand the code
27 in this file. */
28
29#ifdef emacs
30#include <config.h>
31#endif
32
33#include <stdio.h>
34
35#ifdef emacs
36
37#include <sys/types.h>
38#include "lisp.h"
39#include "character.h"
40#include "buffer.h"
41#include "charset.h"
42#include "composite.h"
43#include "disptab.h"
44
45#else /* not emacs */
46
47#include "mulelib.h"
48
49#endif /* emacs */
50
51Lisp_Object Qcharacterp;
52
53/* Vector of translation table ever defined.
54 ID of a translation table is used to index this vector. */
55Lisp_Object Vtranslation_table_vector;
56
57/* A char-table for characters which may invoke auto-filling. */
58Lisp_Object Vauto_fill_chars;
59
60Lisp_Object Qauto_fill_chars;
61
33f91981
KH
62/* Char-table of information about which character to unify to which
63 Unicode character. */
0168c3d8
KH
64Lisp_Object Vchar_unify_table;
65
66/* A char-table. An element is non-nil iff the corresponding
67 character has a printable glyph. */
68Lisp_Object Vprintable_chars;
69
70/* A char-table. An elemnent is a column-width of the corresponding
71 character. */
72Lisp_Object Vchar_width_table;
73
74/* A char-table. An element is a symbol indicating the direction
75 property of corresponding character. */
76Lisp_Object Vchar_direction_table;
77
8973478b 78/* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
0168c3d8 79unsigned char *_fetch_multibyte_char_p;
0168c3d8 80
c57f3328
KH
81/* Char table of scripts. */
82Lisp_Object Vchar_script_table;
83
c7e14352
KH
84/* Alist of scripts vs representative characters. */
85Lisp_Object Vscript_representative_chars;
86
c57f3328
KH
87static Lisp_Object Qchar_script_table;
88
b672c5ae
KH
89/* Mapping table from unibyte chars to multibyte chars. */
90int unibyte_to_multibyte_table[256];
15843e6f 91
90c9d035
KH
92/* Nth element is 1 iff unibyte char N can be mapped to a multibyte
93 char. */
94char unibyte_has_multibyte_table[256];
95
0168c3d8
KH
96\f
97
33f91981
KH
98/* Store multibyte form of character C at P. If C has modifier bits,
99 handle them appropriately. */
100
0168c3d8 101int
e3d8eb8c 102char_string (c, p)
0168c3d8 103 int c;
1889b238 104 unsigned char *p;
0168c3d8
KH
105{
106 int bytes;
107
e3d8eb8c
KH
108 if (c & CHAR_MODIFIER_MASK)
109 {
cc6dfd2a
KH
110 /* As an non-ASCII character can't have modifier bits, we just
111 ignore the bits. */
112 if (ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
e3d8eb8c
KH
113 {
114 /* For Meta, Shift, and Control modifiers, we need special care. */
115 if (c & CHAR_META)
116 {
117 /* Move the meta bit to the right place for a string. */
118 c = (c & ~CHAR_META) | 0x80;
119 }
120 if (c & CHAR_SHIFT)
121 {
122 /* Shift modifier is valid only with [A-Za-z]. */
123 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
124 c &= ~CHAR_SHIFT;
125 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
126 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
127 }
128 if (c & CHAR_CTL)
129 {
130 /* Simulate the code in lread.c. */
131 /* Allow `\C- ' and `\C-?'. */
132 if (c == (CHAR_CTL | ' '))
133 c = 0;
134 else if (c == (CHAR_CTL | '?'))
135 c = 127;
136 /* ASCII control chars are made from letters (both cases),
137 as well as the non-letters within 0100...0137. */
138 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
139 c &= (037 | (~0177 & ~CHAR_CTL));
140 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
141 c &= (037 | (~0177 & ~CHAR_CTL));
142 }
143 }
144
145 /* If C still has any modifier bits, just ignore it. */
146 c &= ~CHAR_MODIFIER_MASK;
147 }
148
0168c3d8
KH
149 MAYBE_UNIFY_CHAR (c);
150
e3d8eb8c 151 if (c <= MAX_3_BYTE_CHAR)
0168c3d8
KH
152 {
153 bytes = CHAR_STRING (c, p);
154 }
155 else if (c <= MAX_4_BYTE_CHAR)
156 {
157 p[0] = (0xF0 | (c >> 18));
158 p[1] = (0x80 | ((c >> 12) & 0x3F));
159 p[2] = (0x80 | ((c >> 6) & 0x3F));
160 p[3] = (0x80 | (c & 0x3F));
161 bytes = 4;
162 }
e3d8eb8c 163 else if (c <= MAX_5_BYTE_CHAR)
0168c3d8
KH
164 {
165 p[0] = 0xF8;
166 p[1] = (0x80 | ((c >> 18) & 0x0F));
167 p[2] = (0x80 | ((c >> 12) & 0x3F));
168 p[3] = (0x80 | ((c >> 6) & 0x3F));
169 p[4] = (0x80 | (c & 0x3F));
170 bytes = 5;
171 }
e3d8eb8c
KH
172 else
173 {
174 c = CHAR_TO_BYTE8 (c);
175 bytes = BYTE8_STRING (c, p);
176 }
1889b238 177
0168c3d8
KH
178 return bytes;
179}
180
181
33f91981
KH
182/* Return a character whose multibyte form is at P. Set LEN is not
183 NULL, it must be a pointer to integer. In that case, set *LEN to
184 the byte length of the multibyte form. If ADVANCED is not NULL, is
185 must be a pointer to unsigned char. In that case, set *ADVANCED to
186 the ending address (i.e. the starting address of the next
187 character) of the multibyte form. */
188
0168c3d8 189int
e3d8eb8c 190string_char (p, advanced, len)
15843e6f
KH
191 const unsigned char *p;
192 const unsigned char **advanced;
0168c3d8
KH
193 int *len;
194{
1889b238 195 int c;
15843e6f 196 const unsigned char *saved_p = p;
0168c3d8
KH
197
198 if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
199 {
200 c = STRING_CHAR_ADVANCE (p);
201 }
202 else if (! (*p & 0x08))
203 {
204 c = ((((p)[0] & 0xF) << 18)
205 | (((p)[1] & 0x3F) << 12)
206 | (((p)[2] & 0x3F) << 6)
207 | ((p)[3] & 0x3F));
208 p += 4;
209 }
210 else
211 {
212 c = ((((p)[1] & 0x3F) << 18)
213 | (((p)[2] & 0x3F) << 12)
214 | (((p)[3] & 0x3F) << 6)
215 | ((p)[4] & 0x3F));
216 p += 5;
217 }
218
219 MAYBE_UNIFY_CHAR (c);
220
221 if (len)
222 *len = p - saved_p;
223 if (advanced)
224 *advanced = p;
225 return c;
226}
227
228
229/* Translate character C by translation table TABLE. If C is
230 negative, translate a character specified by CHARSET and CODE. If
231 no translation is found in TABLE, return the untranslated
10453be9
KH
232 character. If TABLE is a list, elements are char tables. In this
233 case, translace C by all tables. */
0168c3d8
KH
234
235int
236translate_char (table, c)
237 Lisp_Object table;
238 int c;
239{
10453be9
KH
240 if (CHAR_TABLE_P (table))
241 {
242 Lisp_Object ch;
243
244 ch = CHAR_TABLE_REF (table, c);
245 if (CHARACTERP (ch))
246 c = XINT (ch);
247 }
248 else
249 {
250 for (; CONSP (table); table = XCDR (table))
251 c = translate_char (XCAR (table), c);
252 }
253 return c;
0168c3d8
KH
254}
255
0168c3d8 256/* Convert the multibyte character C to unibyte 8-bit character based
ac86488b
KH
257 on the current value of charset_unibyte. If dimension of
258 charset_unibyte is more than one, return (C & 0xFF).
0168c3d8
KH
259
260 The argument REV_TBL is now ignored. It will be removed in the
261 future. */
262
263int
264multibyte_char_to_unibyte (c, rev_tbl)
265 int c;
266 Lisp_Object rev_tbl;
267{
b672c5ae
KH
268 struct charset *charset;
269 unsigned c1;
0168c3d8 270
b672c5ae
KH
271 if (CHAR_BYTE8_P (c))
272 return CHAR_TO_BYTE8 (c);
273 charset = CHARSET_FROM_ID (charset_unibyte);
274 c1 = ENCODE_CHAR (charset, c);
0168c3d8
KH
275 return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
276}
277
278
279DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
280 doc: /* Return non-nil if OBJECT is a character. */)
281 (object, ignore)
282 Lisp_Object object, ignore;
283{
284 return (CHARACTERP (object) ? Qt : Qnil);
285}
286
287DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
288 doc: /* Return the character of the maximum code. */)
289 ()
290{
291 return make_number (MAX_CHAR);
292}
293
294DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
295 Sunibyte_char_to_multibyte, 1, 1, 0,
296 doc: /* Convert the unibyte character CH to multibyte character.
297The multibyte character is a result of decoding CH by
ed1d5bc0 298the current unibyte charset (see `unibyte-charset'). */)
0168c3d8
KH
299 (ch)
300 Lisp_Object ch;
301{
302 int c;
303 struct charset *charset;
304
305 CHECK_CHARACTER (ch);
306 c = XFASTINT (ch);
307 if (c >= 0400)
308 error ("Invalid unibyte character: %d", c);
ac86488b 309 charset = CHARSET_FROM_ID (charset_unibyte);
0168c3d8
KH
310 c = DECODE_CHAR (charset, c);
311 if (c < 0)
3c5a53bd 312 c = BYTE8_TO_CHAR (XFASTINT (ch));
0168c3d8
KH
313 return make_number (c);
314}
315
316DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
317 Smultibyte_char_to_unibyte, 1, 1, 0,
318 doc: /* Convert the multibyte character CH to unibyte character.\n\
319The unibyte character is a result of encoding CH by
320the current primary charset (value of `charset-primary'). */)
321 (ch)
322 Lisp_Object ch;
323{
324 int c;
0168c3d8
KH
325
326 CHECK_CHARACTER (ch);
327 c = XFASTINT (ch);
3c5a53bd
KH
328 c = CHAR_TO_BYTE8 (c);
329 return make_number (c);
0168c3d8
KH
330}
331
332DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
333 doc: /* Return 1 regardless of the argument CHAR.
334This is now an obsolete function. We keep it just for backward compatibility. */)
335 (ch)
336 Lisp_Object ch;
337{
338 CHECK_CHARACTER (ch);
339 return make_number (1);
340}
341
342DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
343 doc: /* Return width of CHAR when displayed in the current buffer.
344The width is measured by how many columns it occupies on the screen.
345Tab is taken to occupy `tab-width' columns. */)
346 (ch)
347 Lisp_Object ch;
348{
349 Lisp_Object disp;
350 int c, width;
351 struct Lisp_Char_Table *dp = buffer_display_table ();
352
353 CHECK_CHARACTER (ch);
354 c = XINT (ch);
355
356 /* Get the way the display table would display it. */
357 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
358
359 if (VECTORP (disp))
360 width = ASIZE (disp);
361 else
362 width = CHAR_WIDTH (c);
363
364 return make_number (width);
365}
366
0168c3d8
KH
367/* Return width of string STR of length LEN when displayed in the
368 current buffer. The width is measured by how many columns it
369 occupies on the screen. If PRECISION > 0, return the width of
370 longest substring that doesn't exceed PRECISION, and set number of
371 characters and bytes of the substring in *NCHARS and *NBYTES
372 respectively. */
373
1889b238 374int
0168c3d8 375c_string_width (str, len, precision, nchars, nbytes)
8f924df7 376 const unsigned char *str;
0168c3d8
KH
377 int precision, *nchars, *nbytes;
378{
379 int i = 0, i_byte = 0;
380 int width = 0;
381 struct Lisp_Char_Table *dp = buffer_display_table ();
382
383 while (i_byte < len)
384 {
385 int bytes, thiswidth;
386 Lisp_Object val;
387 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
388
389 if (dp)
390 {
391 val = DISP_CHAR_VECTOR (dp, c);
392 if (VECTORP (val))
393 thiswidth = XVECTOR (val)->size;
394 else
395 thiswidth = CHAR_WIDTH (c);
396 }
397 else
398 {
399 thiswidth = CHAR_WIDTH (c);
400 }
401
402 if (precision > 0
403 && (width + thiswidth > precision))
404 {
405 *nchars = i;
406 *nbytes = i_byte;
407 return width;
408 }
409 i++;
410 i_byte += bytes;
411 width += thiswidth;
412 }
413
414 if (precision > 0)
415 {
416 *nchars = i;
417 *nbytes = i_byte;
418 }
419
420 return width;
421}
422
1889b238
KH
423/* Return width of string STR of length LEN when displayed in the
424 current buffer. The width is measured by how many columns it
425 occupies on the screen. */
426
427int
428strwidth (str, len)
429 unsigned char *str;
430 int len;
431{
432 return c_string_width (str, len, -1, NULL, NULL);
433}
434
0168c3d8
KH
435/* Return width of Lisp string STRING when displayed in the current
436 buffer. The width is measured by how many columns it occupies on
437 the screen while paying attention to compositions. If PRECISION >
438 0, return the width of longest substring that doesn't exceed
439 PRECISION, and set number of characters and bytes of the substring
440 in *NCHARS and *NBYTES respectively. */
441
442int
443lisp_string_width (string, precision, nchars, nbytes)
444 Lisp_Object string;
445 int precision, *nchars, *nbytes;
446{
8f924df7 447 int len = SCHARS (string);
0aee65b9
KH
448 /* This set multibyte to 0 even if STRING is multibyte when it
449 contains only ascii and eight-bit-graphic, but that's
450 intentional. */
451 int multibyte = len < SBYTES (string);
8f924df7 452 unsigned char *str = SDATA (string);
0168c3d8
KH
453 int i = 0, i_byte = 0;
454 int width = 0;
455 struct Lisp_Char_Table *dp = buffer_display_table ();
456
457 while (i < len)
458 {
459 int chars, bytes, thiswidth;
460 Lisp_Object val;
461 int cmp_id;
f4bc0685 462 EMACS_INT ignore, end;
0168c3d8
KH
463
464 if (find_composition (i, -1, &ignore, &end, &val, string)
465 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
466 >= 0))
467 {
468 thiswidth = composition_table[cmp_id]->width;
469 chars = end - i;
470 bytes = string_char_to_byte (string, end) - i_byte;
471 }
0168c3d8
KH
472 else
473 {
0aee65b9 474 int c;
0168c3d8 475
0aee65b9
KH
476 if (multibyte)
477 c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
478 else
479 c = str[i_byte], bytes = 1;
0168c3d8 480 chars = 1;
0aee65b9
KH
481 if (dp)
482 {
483 val = DISP_CHAR_VECTOR (dp, c);
484 if (VECTORP (val))
485 thiswidth = XVECTOR (val)->size;
486 else
487 thiswidth = CHAR_WIDTH (c);
488 }
489 else
490 {
491 thiswidth = CHAR_WIDTH (c);
492 }
0168c3d8
KH
493 }
494
495 if (precision > 0
496 && (width + thiswidth > precision))
497 {
498 *nchars = i;
499 *nbytes = i_byte;
500 return width;
501 }
502 i += chars;
503 i_byte += bytes;
504 width += thiswidth;
505 }
506
507 if (precision > 0)
508 {
509 *nchars = i;
510 *nbytes = i_byte;
511 }
512
513 return width;
514}
515
516DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
517 doc: /* Return width of STRING when displayed in the current buffer.
518Width is measured by how many columns it occupies on the screen.
519When calculating width of a multibyte character in STRING,
520only the base leading-code is considered; the validity of
521the following bytes is not checked. Tabs in STRING are always
522taken to occupy `tab-width' columns. */)
523 (str)
524 Lisp_Object str;
525{
526 Lisp_Object val;
527
528 CHECK_STRING (str);
529 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
530 return val;
531}
532
533DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
534 doc: /* Return the direction of CHAR.
535The returned value is 0 for left-to-right and 1 for right-to-left. */)
536 (ch)
537 Lisp_Object ch;
538{
539 int c;
540
541 CHECK_CHARACTER (ch);
542 c = XINT (ch);
543 return CHAR_TABLE_REF (Vchar_direction_table, c);
544}
545
0168c3d8
KH
546/* Return the number of characters in the NBYTES bytes at PTR.
547 This works by looking at the contents and checking for multibyte
548 sequences while assuming that there's no invalid sequence.
549 However, if the current buffer has enable-multibyte-characters =
550 nil, we treat each byte as a character. */
551
552int
553chars_in_text (ptr, nbytes)
8f924df7 554 const unsigned char *ptr;
0168c3d8
KH
555 int nbytes;
556{
557 /* current_buffer is null at early stages of Emacs initialization. */
558 if (current_buffer == 0
559 || NILP (current_buffer->enable_multibyte_characters))
560 return nbytes;
561
562 return multibyte_chars_in_text (ptr, nbytes);
563}
564
565/* Return the number of characters in the NBYTES bytes at PTR.
566 This works by looking at the contents and checking for multibyte
567 sequences while assuming that there's no invalid sequence. It
568 ignores enable-multibyte-characters. */
569
570int
571multibyte_chars_in_text (ptr, nbytes)
8f924df7 572 const unsigned char *ptr;
0168c3d8
KH
573 int nbytes;
574{
8f924df7 575 const unsigned char *endp = ptr + nbytes;
0168c3d8
KH
576 int chars = 0;
577
578 while (ptr < endp)
579 {
580 int len = MULTIBYTE_LENGTH (ptr, endp);
581
582 if (len == 0)
583 abort ();
584 ptr += len;
585 chars++;
586 }
587
588 return chars;
589}
590
591/* Parse unibyte text at STR of LEN bytes as a multibyte text, count
592 characters and bytes in it, and store them in *NCHARS and *NBYTES
593 respectively. On counting bytes, pay attention to that 8-bit
594 characters not constructing a valid multibyte sequence are
595 represented by 2-byte in a multibyte text. */
596
597void
598parse_str_as_multibyte (str, len, nchars, nbytes)
8f924df7 599 const unsigned char *str;
0168c3d8
KH
600 int len, *nchars, *nbytes;
601{
8f924df7 602 const unsigned char *endp = str + len;
0168c3d8
KH
603 int n, chars = 0, bytes = 0;
604
605 if (len >= MAX_MULTIBYTE_LENGTH)
606 {
8f924df7 607 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
0168c3d8
KH
608 while (str < adjusted_endp)
609 {
610 if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
611 str += n, bytes += n;
612 else
613 str++, bytes += 2;
614 chars++;
615 }
616 }
617 while (str < endp)
618 {
619 if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
620 str += n, bytes += n;
621 else
622 str++, bytes += 2;
623 chars++;
624 }
625
626 *nchars = chars;
627 *nbytes = bytes;
628 return;
629}
630
631/* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
632 It actually converts only such 8-bit characters that don't contruct
633 a multibyte sequence to multibyte forms of Latin-1 characters. If
634 NCHARS is nonzero, set *NCHARS to the number of characters in the
635 text. It is assured that we can use LEN bytes at STR as a work
636 area and that is enough. Return the number of bytes of the
637 resulting text. */
638
639int
640str_as_multibyte (str, len, nbytes, nchars)
641 unsigned char *str;
642 int len, nbytes, *nchars;
643{
644 unsigned char *p = str, *endp = str + nbytes;
645 unsigned char *to;
646 int chars = 0;
647 int n;
648
649 if (nbytes >= MAX_MULTIBYTE_LENGTH)
650 {
651 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
652 while (p < adjusted_endp
653 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
654 p += n, chars++;
655 }
656 while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
657 p += n, chars++;
658 if (nchars)
659 *nchars = chars;
660 if (p == endp)
661 return nbytes;
662
663 to = p;
664 nbytes = endp - p;
665 endp = str + len;
666 safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
667 p = endp - nbytes;
668
669 if (nbytes >= MAX_MULTIBYTE_LENGTH)
670 {
671 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
672 while (p < adjusted_endp)
673 {
674 if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
675 {
676 while (n--)
677 *to++ = *p++;
678 }
679 else
680 {
681 int c = *p++;
682 c = BYTE8_TO_CHAR (c);
683 to += CHAR_STRING (c, to);
684 }
685 }
686 chars++;
687 }
688 while (p < endp)
689 {
690 if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
691 {
692 while (n--)
693 *to++ = *p++;
8f924df7 694 }
0168c3d8
KH
695 else
696 {
697 int c = *p++;
698 c = BYTE8_TO_CHAR (c);
699 to += CHAR_STRING (c, to);
700 }
701 chars++;
702 }
703 if (nchars)
704 *nchars = chars;
705 return (to - str);
706}
707
708/* Parse unibyte string at STR of LEN bytes, and return the number of
709 bytes it may ocupy when converted to multibyte string by
710 `str_to_multibyte'. */
711
712int
713parse_str_to_multibyte (str, len)
714 unsigned char *str;
715 int len;
716{
717 unsigned char *endp = str + len;
718 int bytes;
719
720 for (bytes = 0; str < endp; str++)
721 bytes += (*str < 0x80) ? 1 : 2;
722 return bytes;
723}
724
725
726/* Convert unibyte text at STR of NBYTES bytes to a multibyte text
727 that contains the same single-byte characters. It actually
728 converts all 8-bit characters to multibyte forms. It is assured
729 that we can use LEN bytes at STR as a work area and that is
730 enough. */
731
732int
733str_to_multibyte (str, len, bytes)
734 unsigned char *str;
735 int len, bytes;
736{
737 unsigned char *p = str, *endp = str + bytes;
738 unsigned char *to;
739
740 while (p < endp && *p < 0x80) p++;
741 if (p == endp)
742 return bytes;
743 to = p;
744 bytes = endp - p;
745 endp = str + len;
746 safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
747 p = endp - bytes;
8f924df7 748 while (p < endp)
0168c3d8
KH
749 {
750 int c = *p++;
751
752 if (c >= 0x80)
753 c = BYTE8_TO_CHAR (c);
754 to += CHAR_STRING (c, to);
755 }
756 return (to - str);
757}
758
759/* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
760 actually converts characters in the range 0x80..0xFF to
761 unibyte. */
762
763int
764str_as_unibyte (str, bytes)
765 unsigned char *str;
766 int bytes;
767{
15843e6f
KH
768 const unsigned char *p = str, *endp = str + bytes;
769 unsigned char *to;
0168c3d8
KH
770 int c, len;
771
772 while (p < endp)
773 {
774 c = *p;
775 len = BYTES_BY_CHAR_HEAD (c);
776 if (CHAR_BYTE8_HEAD_P (c))
777 break;
778 p += len;
779 }
15843e6f 780 to = str + (p - str);
8f924df7 781 while (p < endp)
0168c3d8
KH
782 {
783 c = *p;
784 len = BYTES_BY_CHAR_HEAD (c);
785 if (CHAR_BYTE8_HEAD_P (c))
786 {
787 c = STRING_CHAR_ADVANCE (p);
788 *to++ = CHAR_TO_BYTE8 (c);
789 }
790 else
791 {
792 while (len--) *to++ = *p++;
793 }
794 }
795 return (to - str);
796}
797
798int
799string_count_byte8 (string)
800 Lisp_Object string;
801{
802 int multibyte = STRING_MULTIBYTE (string);
8f924df7
KH
803 int nbytes = SBYTES (string);
804 unsigned char *p = SDATA (string);
0168c3d8
KH
805 unsigned char *pend = p + nbytes;
806 int count = 0;
807 int c, len;
808
809 if (multibyte)
810 while (p < pend)
811 {
812 c = *p;
813 len = BYTES_BY_CHAR_HEAD (c);
814
815 if (CHAR_BYTE8_HEAD_P (c))
816 count++;
817 p += len;
818 }
819 else
820 while (p < pend)
821 {
822 if (*p++ >= 0x80)
823 count++;
824 }
825 return count;
826}
827
828
829Lisp_Object
830string_escape_byte8 (string)
831 Lisp_Object string;
832{
8f924df7
KH
833 int nchars = SCHARS (string);
834 int nbytes = SBYTES (string);
0168c3d8
KH
835 int multibyte = STRING_MULTIBYTE (string);
836 int byte8_count;
15843e6f
KH
837 const unsigned char *src, *src_end;
838 unsigned char *dst;
0168c3d8
KH
839 Lisp_Object val;
840 int c, len;
841
842 if (multibyte && nchars == nbytes)
843 return string;
844
845 byte8_count = string_count_byte8 (string);
846
847 if (byte8_count == 0)
848 return string;
849
850 if (multibyte)
851 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
7b40ebaf 852 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
0168c3d8
KH
853 nbytes + byte8_count * 2);
854 else
855 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
856 val = make_uninit_string (nbytes + byte8_count * 3);
857
8f924df7 858 src = SDATA (string);
0168c3d8 859 src_end = src + nbytes;
8f924df7 860 dst = SDATA (val);
0168c3d8
KH
861 if (multibyte)
862 while (src < src_end)
863 {
864 c = *src;
865 len = BYTES_BY_CHAR_HEAD (c);
866
867 if (CHAR_BYTE8_HEAD_P (c))
868 {
869 c = STRING_CHAR_ADVANCE (src);
870 c = CHAR_TO_BYTE8 (c);
1889b238 871 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
872 dst += 4;
873 }
874 else
875 while (len--) *dst++ = *src++;
876 }
877 else
878 while (src < src_end)
879 {
880 c = *src++;
881 if (c >= 0x80)
882 {
1889b238 883 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
884 dst += 4;
885 }
886 else
887 *dst++ = c;
888 }
889 return val;
890}
891
892\f
8f924df7 893DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
0168c3d8 894 doc: /*
d2e83296
DL
895Concatenate all the argument characters and make the result a string.
896usage: (string &rest CHARACTERS) */)
0168c3d8
KH
897 (n, args)
898 int n;
899 Lisp_Object *args;
900{
901 int i;
902 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
903 unsigned char *p = buf;
904 int c;
905
906 for (i = 0; i < n; i++)
907 {
908 CHECK_CHARACTER (args[i]);
909 c = XINT (args[i]);
910 p += CHAR_STRING (c, p);
911 }
912
913 return make_string_from_bytes ((char *) buf, n, p - buf);
914}
915
916void
917init_character_once ()
918{
919}
920
921#ifdef emacs
922
923void
924syms_of_character ()
925{
926 DEFSYM (Qcharacterp, "characterp");
927 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
928
929 staticpro (&Vchar_unify_table);
930 Vchar_unify_table = Qnil;
931
932 defsubr (&Smax_char);
933 defsubr (&Scharacterp);
934 defsubr (&Sunibyte_char_to_multibyte);
935 defsubr (&Smultibyte_char_to_unibyte);
936 defsubr (&Schar_bytes);
937 defsubr (&Schar_width);
938 defsubr (&Sstring_width);
939 defsubr (&Schar_direction);
0168c3d8
KH
940 defsubr (&Sstring);
941
942 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
943 doc: /*
68978cf0
DL
944Vector recording all translation tables ever defined.
945Each element is a pair (SYMBOL . TABLE) relating the table to the
946symbol naming it. The ID of a translation table is an index into this vector. */);
0168c3d8
KH
947 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
948
949 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
950 doc: /*
951A char-table for characters which invoke auto-filling.
952Such characters have value t in this table. */);
953 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
6cc0e1ca
DL
954 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
955 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
0168c3d8
KH
956
957 DEFVAR_LISP ("char-width-table", &Vchar_width_table,
958 doc: /*
959A char-table for width (columns) of each character. */);
960 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
be8b50bc
KH
961 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
962 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
963 make_number (4));
0168c3d8
KH
964
965 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
966 doc: /* A char-table for direction of each character. */);
967 Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
968
969 DEFVAR_LISP ("printable-chars", &Vprintable_chars,
970 doc: /* A char-table for each printable character. */);
db6d4189 971 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
67dde660
KH
972 Fset_char_table_range (Vprintable_chars,
973 Fcons (make_number (32), make_number (126)), Qt);
974 Fset_char_table_range (Vprintable_chars,
975 Fcons (make_number (160),
976 make_number (MAX_5_BYTE_CHAR)), Qt);
15843e6f 977
c57f3328
KH
978 DEFVAR_LISP ("char-script-table", &Vchar_script_table,
979 doc: /* Char table of script symbols.
980It has one extra slot whose value is a list of script symbols. */);
981
982 /* Intern this now in case it isn't already done.
983 Setting this variable twice is harmless.
984 But don't staticpro it here--that is done in alloc.c. */
985 Qchar_table_extra_slots = intern ("char-table-extra-slots");
986 DEFSYM (Qchar_script_table, "char-script-table");
987 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
988 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
c7e14352
KH
989
990 DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
991 doc: /* Alist of scripts vs the representative characters. */);
992 Vscript_representative_chars = Qnil;
0168c3d8
KH
993}
994
995#endif /* emacs */
fbaf0946
MB
996
997/* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
998 (do not change this comment) */