*** empty log message ***
[bpt/emacs.git] / src / character.c
CommitLineData
0168c3d8
KH
1/* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
8f924df7 3 Licensed to the Free Software Foundation.
0168c3d8 4 Copyright (C) 2001 Free Software Foundation, Inc.
8f924df7 5 Copyright (C) 2003
0168c3d8
KH
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9This file is part of GNU Emacs.
10
11GNU Emacs is free software; you can redistribute it and/or modify
12it under the terms of the GNU General Public License as published by
13the Free Software Foundation; either version 2, or (at your option)
14any later version.
15
16GNU Emacs is distributed in the hope that it will be useful,
17but WITHOUT ANY WARRANTY; without even the implied warranty of
18MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19GNU General Public License for more details.
20
21You should have received a copy of the GNU General Public License
22along with GNU Emacs; see the file COPYING. If not, write to
23the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24Boston, MA 02111-1307, USA. */
25
26/* At first, see the document in `character.h' to understand the code
27 in this file. */
28
29#ifdef emacs
30#include <config.h>
31#endif
32
33#include <stdio.h>
34
35#ifdef emacs
36
37#include <sys/types.h>
38#include "lisp.h"
39#include "character.h"
40#include "buffer.h"
41#include "charset.h"
42#include "composite.h"
43#include "disptab.h"
44
45#else /* not emacs */
46
47#include "mulelib.h"
48
49#endif /* emacs */
50
51Lisp_Object Qcharacterp;
52
/* Vector of all translation tables ever defined.
   The ID of a translation table is used as an index into this vector. */
55Lisp_Object Vtranslation_table_vector;
56
57/* A char-table for characters which may invoke auto-filling. */
58Lisp_Object Vauto_fill_chars;
59
60Lisp_Object Qauto_fill_chars;
61
33f91981
KH
62/* Char-table of information about which character to unify to which
63 Unicode character. */
0168c3d8
KH
64Lisp_Object Vchar_unify_table;
65
66/* A char-table. An element is non-nil iff the corresponding
67 character has a printable glyph. */
68Lisp_Object Vprintable_chars;
69
/* A char-table. An element is the column width of the corresponding
   character. */
72Lisp_Object Vchar_width_table;
73
74/* A char-table. An element is a symbol indicating the direction
75 property of corresponding character. */
76Lisp_Object Vchar_direction_table;
77
8973478b 78/* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
0168c3d8 79unsigned char *_fetch_multibyte_char_p;
0168c3d8 80
c57f3328
KH
81/* Char table of scripts. */
82Lisp_Object Vchar_script_table;
83
84static Lisp_Object Qchar_script_table;
85
b672c5ae
KH
86/* Mapping table from unibyte chars to multibyte chars. */
87int unibyte_to_multibyte_table[256];
15843e6f 88
0168c3d8
KH
89\f
90
33f91981
KH
91/* Store multibyte form of character C at P. If C has modifier bits,
92 handle them appropriately. */
93
0168c3d8 94int
e3d8eb8c 95char_string (c, p)
0168c3d8 96 int c;
1889b238 97 unsigned char *p;
0168c3d8
KH
98{
99 int bytes;
100
e3d8eb8c
KH
101 if (c & CHAR_MODIFIER_MASK)
102 {
cc6dfd2a
KH
103 /* As an non-ASCII character can't have modifier bits, we just
104 ignore the bits. */
105 if (ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
e3d8eb8c
KH
106 {
107 /* For Meta, Shift, and Control modifiers, we need special care. */
108 if (c & CHAR_META)
109 {
110 /* Move the meta bit to the right place for a string. */
111 c = (c & ~CHAR_META) | 0x80;
112 }
113 if (c & CHAR_SHIFT)
114 {
115 /* Shift modifier is valid only with [A-Za-z]. */
116 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
117 c &= ~CHAR_SHIFT;
118 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
119 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
120 }
121 if (c & CHAR_CTL)
122 {
123 /* Simulate the code in lread.c. */
124 /* Allow `\C- ' and `\C-?'. */
125 if (c == (CHAR_CTL | ' '))
126 c = 0;
127 else if (c == (CHAR_CTL | '?'))
128 c = 127;
129 /* ASCII control chars are made from letters (both cases),
130 as well as the non-letters within 0100...0137. */
131 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
132 c &= (037 | (~0177 & ~CHAR_CTL));
133 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
134 c &= (037 | (~0177 & ~CHAR_CTL));
135 }
136 }
137
138 /* If C still has any modifier bits, just ignore it. */
139 c &= ~CHAR_MODIFIER_MASK;
140 }
141
0168c3d8
KH
142 MAYBE_UNIFY_CHAR (c);
143
e3d8eb8c 144 if (c <= MAX_3_BYTE_CHAR)
0168c3d8
KH
145 {
146 bytes = CHAR_STRING (c, p);
147 }
148 else if (c <= MAX_4_BYTE_CHAR)
149 {
150 p[0] = (0xF0 | (c >> 18));
151 p[1] = (0x80 | ((c >> 12) & 0x3F));
152 p[2] = (0x80 | ((c >> 6) & 0x3F));
153 p[3] = (0x80 | (c & 0x3F));
154 bytes = 4;
155 }
e3d8eb8c 156 else if (c <= MAX_5_BYTE_CHAR)
0168c3d8
KH
157 {
158 p[0] = 0xF8;
159 p[1] = (0x80 | ((c >> 18) & 0x0F));
160 p[2] = (0x80 | ((c >> 12) & 0x3F));
161 p[3] = (0x80 | ((c >> 6) & 0x3F));
162 p[4] = (0x80 | (c & 0x3F));
163 bytes = 5;
164 }
e3d8eb8c
KH
165 else
166 {
167 c = CHAR_TO_BYTE8 (c);
168 bytes = BYTE8_STRING (c, p);
169 }
1889b238 170
0168c3d8
KH
171 return bytes;
172}
173
174
33f91981
KH
/* Return the character whose multibyte form starts at P.  If LEN is
   not NULL, it must point to an int; *LEN is set to the byte length
   of the multibyte form.  If ADVANCED is not NULL, it must point to
   a pointer to unsigned char; *ADVANCED is set to the address just
   past the multibyte form (i.e. the start of the next character).  */

int
string_char (p, advanced, len)
     const unsigned char *p;
     const unsigned char **advanced;
     int *len;
{
  const unsigned char *start = p;
  int c;

  if (*p < 0x80 || (*p & 0x30) != 0x30)
    {
      /* Lead bytes that do not have both 0x20 and 0x10 set are left
         to the generic macro.  */
      c = STRING_CHAR_ADVANCE (p);
    }
  else if (*p & 0x08)
    {
      /* 5-byte form: the lead byte carries no payload bits.  */
      c = (((p[1] & 0x3F) << 18)
           | ((p[2] & 0x3F) << 12)
           | ((p[3] & 0x3F) << 6)
           | (p[4] & 0x3F));
      p += 5;
    }
  else
    {
      /* 4-byte form.  */
      c = (((p[0] & 0xF) << 18)
           | ((p[1] & 0x3F) << 12)
           | ((p[2] & 0x3F) << 6)
           | (p[3] & 0x3F));
      p += 4;
    }

  MAYBE_UNIFY_CHAR (c);

  if (len)
    *len = p - start;
  if (advanced)
    *advanced = p;
  return c;
}
220
221
222/* Translate character C by translation table TABLE. If C is
223 negative, translate a character specified by CHARSET and CODE. If
224 no translation is found in TABLE, return the untranslated
10453be9
KH
225 character. If TABLE is a list, elements are char tables. In this
226 case, translace C by all tables. */
0168c3d8
KH
227
228int
229translate_char (table, c)
230 Lisp_Object table;
231 int c;
232{
10453be9
KH
233 if (CHAR_TABLE_P (table))
234 {
235 Lisp_Object ch;
236
237 ch = CHAR_TABLE_REF (table, c);
238 if (CHARACTERP (ch))
239 c = XINT (ch);
240 }
241 else
242 {
243 for (; CONSP (table); table = XCDR (table))
244 c = translate_char (XCAR (table), c);
245 }
246 return c;
0168c3d8
KH
247}
248
0168c3d8 249/* Convert the multibyte character C to unibyte 8-bit character based
ac86488b
KH
250 on the current value of charset_unibyte. If dimension of
251 charset_unibyte is more than one, return (C & 0xFF).
0168c3d8
KH
252
253 The argument REV_TBL is now ignored. It will be removed in the
254 future. */
255
256int
257multibyte_char_to_unibyte (c, rev_tbl)
258 int c;
259 Lisp_Object rev_tbl;
260{
b672c5ae
KH
261 struct charset *charset;
262 unsigned c1;
0168c3d8 263
b672c5ae
KH
264 if (CHAR_BYTE8_P (c))
265 return CHAR_TO_BYTE8 (c);
266 charset = CHARSET_FROM_ID (charset_unibyte);
267 c1 = ENCODE_CHAR (charset, c);
0168c3d8
KH
268 return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
269}
270
271
272DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
273 doc: /* Return non-nil if OBJECT is a character. */)
274 (object, ignore)
275 Lisp_Object object, ignore;
276{
277 return (CHARACTERP (object) ? Qt : Qnil);
278}
279
280DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
281 doc: /* Return the character of the maximum code. */)
282 ()
283{
284 return make_number (MAX_CHAR);
285}
286
287DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
288 Sunibyte_char_to_multibyte, 1, 1, 0,
289 doc: /* Convert the unibyte character CH to multibyte character.
290The multibyte character is a result of decoding CH by
ed1d5bc0 291the current unibyte charset (see `unibyte-charset'). */)
0168c3d8
KH
292 (ch)
293 Lisp_Object ch;
294{
295 int c;
296 struct charset *charset;
297
298 CHECK_CHARACTER (ch);
299 c = XFASTINT (ch);
300 if (c >= 0400)
301 error ("Invalid unibyte character: %d", c);
ac86488b 302 charset = CHARSET_FROM_ID (charset_unibyte);
0168c3d8
KH
303 c = DECODE_CHAR (charset, c);
304 if (c < 0)
3c5a53bd 305 c = BYTE8_TO_CHAR (XFASTINT (ch));
0168c3d8
KH
306 return make_number (c);
307}
308
309DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
310 Smultibyte_char_to_unibyte, 1, 1, 0,
311 doc: /* Convert the multibyte character CH to unibyte character.\n\
312The unibyte character is a result of encoding CH by
313the current primary charset (value of `charset-primary'). */)
314 (ch)
315 Lisp_Object ch;
316{
317 int c;
0168c3d8
KH
318
319 CHECK_CHARACTER (ch);
320 c = XFASTINT (ch);
3c5a53bd
KH
321 c = CHAR_TO_BYTE8 (c);
322 return make_number (c);
0168c3d8
KH
323}
324
325DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
326 doc: /* Return 1 regardless of the argument CHAR.
327This is now an obsolete function. We keep it just for backward compatibility. */)
328 (ch)
329 Lisp_Object ch;
330{
331 CHECK_CHARACTER (ch);
332 return make_number (1);
333}
334
335DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
336 doc: /* Return width of CHAR when displayed in the current buffer.
337The width is measured by how many columns it occupies on the screen.
338Tab is taken to occupy `tab-width' columns. */)
339 (ch)
340 Lisp_Object ch;
341{
342 Lisp_Object disp;
343 int c, width;
344 struct Lisp_Char_Table *dp = buffer_display_table ();
345
346 CHECK_CHARACTER (ch);
347 c = XINT (ch);
348
349 /* Get the way the display table would display it. */
350 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
351
352 if (VECTORP (disp))
353 width = ASIZE (disp);
354 else
355 width = CHAR_WIDTH (c);
356
357 return make_number (width);
358}
359
0168c3d8
KH
360/* Return width of string STR of length LEN when displayed in the
361 current buffer. The width is measured by how many columns it
362 occupies on the screen. If PRECISION > 0, return the width of
363 longest substring that doesn't exceed PRECISION, and set number of
364 characters and bytes of the substring in *NCHARS and *NBYTES
365 respectively. */
366
1889b238 367int
0168c3d8 368c_string_width (str, len, precision, nchars, nbytes)
8f924df7 369 const unsigned char *str;
0168c3d8
KH
370 int precision, *nchars, *nbytes;
371{
372 int i = 0, i_byte = 0;
373 int width = 0;
374 struct Lisp_Char_Table *dp = buffer_display_table ();
375
376 while (i_byte < len)
377 {
378 int bytes, thiswidth;
379 Lisp_Object val;
380 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
381
382 if (dp)
383 {
384 val = DISP_CHAR_VECTOR (dp, c);
385 if (VECTORP (val))
386 thiswidth = XVECTOR (val)->size;
387 else
388 thiswidth = CHAR_WIDTH (c);
389 }
390 else
391 {
392 thiswidth = CHAR_WIDTH (c);
393 }
394
395 if (precision > 0
396 && (width + thiswidth > precision))
397 {
398 *nchars = i;
399 *nbytes = i_byte;
400 return width;
401 }
402 i++;
403 i_byte += bytes;
404 width += thiswidth;
405 }
406
407 if (precision > 0)
408 {
409 *nchars = i;
410 *nbytes = i_byte;
411 }
412
413 return width;
414}
415
1889b238
KH
416/* Return width of string STR of length LEN when displayed in the
417 current buffer. The width is measured by how many columns it
418 occupies on the screen. */
419
420int
421strwidth (str, len)
422 unsigned char *str;
423 int len;
424{
425 return c_string_width (str, len, -1, NULL, NULL);
426}
427
0168c3d8
KH
428/* Return width of Lisp string STRING when displayed in the current
429 buffer. The width is measured by how many columns it occupies on
430 the screen while paying attention to compositions. If PRECISION >
431 0, return the width of longest substring that doesn't exceed
432 PRECISION, and set number of characters and bytes of the substring
433 in *NCHARS and *NBYTES respectively. */
434
435int
436lisp_string_width (string, precision, nchars, nbytes)
437 Lisp_Object string;
438 int precision, *nchars, *nbytes;
439{
8f924df7
KH
440 int len = SCHARS (string);
441 unsigned char *str = SDATA (string);
0168c3d8
KH
442 int i = 0, i_byte = 0;
443 int width = 0;
444 struct Lisp_Char_Table *dp = buffer_display_table ();
445
446 while (i < len)
447 {
448 int chars, bytes, thiswidth;
449 Lisp_Object val;
450 int cmp_id;
f4bc0685 451 EMACS_INT ignore, end;
0168c3d8
KH
452
453 if (find_composition (i, -1, &ignore, &end, &val, string)
454 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
455 >= 0))
456 {
457 thiswidth = composition_table[cmp_id]->width;
458 chars = end - i;
459 bytes = string_char_to_byte (string, end) - i_byte;
460 }
461 else if (dp)
462 {
463 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
464
465 chars = 1;
466 val = DISP_CHAR_VECTOR (dp, c);
467 if (VECTORP (val))
468 thiswidth = XVECTOR (val)->size;
469 else
470 thiswidth = CHAR_WIDTH (c);
471 }
472 else
473 {
474 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
475
476 chars = 1;
477 thiswidth = CHAR_WIDTH (c);
478 }
479
480 if (precision > 0
481 && (width + thiswidth > precision))
482 {
483 *nchars = i;
484 *nbytes = i_byte;
485 return width;
486 }
487 i += chars;
488 i_byte += bytes;
489 width += thiswidth;
490 }
491
492 if (precision > 0)
493 {
494 *nchars = i;
495 *nbytes = i_byte;
496 }
497
498 return width;
499}
500
501DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
502 doc: /* Return width of STRING when displayed in the current buffer.
503Width is measured by how many columns it occupies on the screen.
504When calculating width of a multibyte character in STRING,
505only the base leading-code is considered; the validity of
506the following bytes is not checked. Tabs in STRING are always
507taken to occupy `tab-width' columns. */)
508 (str)
509 Lisp_Object str;
510{
511 Lisp_Object val;
512
513 CHECK_STRING (str);
514 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
515 return val;
516}
517
518DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
519 doc: /* Return the direction of CHAR.
520The returned value is 0 for left-to-right and 1 for right-to-left. */)
521 (ch)
522 Lisp_Object ch;
523{
524 int c;
525
526 CHECK_CHARACTER (ch);
527 c = XINT (ch);
528 return CHAR_TABLE_REF (Vchar_direction_table, c);
529}
530
531DEFUN ("chars-in-region", Fchars_in_region, Schars_in_region, 2, 2, 0,
532 doc: /* Return number of characters between BEG and END.
533This is now an obsolete function. We keep it just for backward compatibility. */)
534 (beg, end)
535 Lisp_Object beg, end;
536{
537 int from, to;
538
539 CHECK_NUMBER_COERCE_MARKER (beg);
540 CHECK_NUMBER_COERCE_MARKER (end);
541
542 from = min (XFASTINT (beg), XFASTINT (end));
543 to = max (XFASTINT (beg), XFASTINT (end));
544
545 return make_number (to - from);
546}
547
548/* Return the number of characters in the NBYTES bytes at PTR.
549 This works by looking at the contents and checking for multibyte
550 sequences while assuming that there's no invalid sequence.
551 However, if the current buffer has enable-multibyte-characters =
552 nil, we treat each byte as a character. */
553
554int
555chars_in_text (ptr, nbytes)
8f924df7 556 const unsigned char *ptr;
0168c3d8
KH
557 int nbytes;
558{
559 /* current_buffer is null at early stages of Emacs initialization. */
560 if (current_buffer == 0
561 || NILP (current_buffer->enable_multibyte_characters))
562 return nbytes;
563
564 return multibyte_chars_in_text (ptr, nbytes);
565}
566
/* Return the number of characters in the NBYTES bytes at PTR,
   scanning for multibyte sequences and assuming there is no invalid
   sequence.  enable-multibyte-characters is not consulted.  Aborts
   if a sequence of length zero is reported.  */

int
multibyte_chars_in_text (ptr, nbytes)
     const unsigned char *ptr;
     int nbytes;
{
  const unsigned char *limit = ptr + nbytes;
  int count;

  for (count = 0; ptr < limit; count++)
    {
      int step = MULTIBYTE_LENGTH (ptr, limit);

      if (step == 0)
        abort ();
      ptr += step;
    }

  return count;
}
592
593/* Parse unibyte text at STR of LEN bytes as a multibyte text, count
594 characters and bytes in it, and store them in *NCHARS and *NBYTES
595 respectively. On counting bytes, pay attention to that 8-bit
596 characters not constructing a valid multibyte sequence are
597 represented by 2-byte in a multibyte text. */
598
599void
600parse_str_as_multibyte (str, len, nchars, nbytes)
8f924df7 601 const unsigned char *str;
0168c3d8
KH
602 int len, *nchars, *nbytes;
603{
8f924df7 604 const unsigned char *endp = str + len;
0168c3d8
KH
605 int n, chars = 0, bytes = 0;
606
607 if (len >= MAX_MULTIBYTE_LENGTH)
608 {
8f924df7 609 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
0168c3d8
KH
610 while (str < adjusted_endp)
611 {
612 if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
613 str += n, bytes += n;
614 else
615 str++, bytes += 2;
616 chars++;
617 }
618 }
619 while (str < endp)
620 {
621 if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
622 str += n, bytes += n;
623 else
624 str++, bytes += 2;
625 chars++;
626 }
627
628 *nchars = chars;
629 *nbytes = bytes;
630 return;
631}
632
633/* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
634 It actually converts only such 8-bit characters that don't contruct
635 a multibyte sequence to multibyte forms of Latin-1 characters. If
636 NCHARS is nonzero, set *NCHARS to the number of characters in the
637 text. It is assured that we can use LEN bytes at STR as a work
638 area and that is enough. Return the number of bytes of the
639 resulting text. */
640
641int
642str_as_multibyte (str, len, nbytes, nchars)
643 unsigned char *str;
644 int len, nbytes, *nchars;
645{
646 unsigned char *p = str, *endp = str + nbytes;
647 unsigned char *to;
648 int chars = 0;
649 int n;
650
651 if (nbytes >= MAX_MULTIBYTE_LENGTH)
652 {
653 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
654 while (p < adjusted_endp
655 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
656 p += n, chars++;
657 }
658 while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
659 p += n, chars++;
660 if (nchars)
661 *nchars = chars;
662 if (p == endp)
663 return nbytes;
664
665 to = p;
666 nbytes = endp - p;
667 endp = str + len;
668 safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
669 p = endp - nbytes;
670
671 if (nbytes >= MAX_MULTIBYTE_LENGTH)
672 {
673 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
674 while (p < adjusted_endp)
675 {
676 if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
677 {
678 while (n--)
679 *to++ = *p++;
680 }
681 else
682 {
683 int c = *p++;
684 c = BYTE8_TO_CHAR (c);
685 to += CHAR_STRING (c, to);
686 }
687 }
688 chars++;
689 }
690 while (p < endp)
691 {
692 if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
693 {
694 while (n--)
695 *to++ = *p++;
8f924df7 696 }
0168c3d8
KH
697 else
698 {
699 int c = *p++;
700 c = BYTE8_TO_CHAR (c);
701 to += CHAR_STRING (c, to);
702 }
703 chars++;
704 }
705 if (nchars)
706 *nchars = chars;
707 return (to - str);
708}
709
/* Scan the LEN bytes of unibyte string at STR and return the number
   of bytes it may occupy after conversion to a multibyte string by
   `str_to_multibyte': one byte for each byte below 0x80 and two for
   every other byte.  */

int
parse_str_to_multibyte (str, len)
     unsigned char *str;
     int len;
{
  int total = 0;
  int i;

  for (i = 0; i < len; i++)
    {
      if (str[i] >= 0x80)
        total += 2;
      else
        total += 1;
    }
  return total;
}
726
727
/* Convert the BYTES bytes of unibyte text at STR, in place, to
   multibyte text containing the same single-byte characters: every
   8-bit byte is turned into its multibyte form.  The caller
   guarantees that LEN bytes at STR are usable as a work area and
   that this is enough.  Return the byte length of the result.  */

int
str_to_multibyte (str, len, bytes)
     unsigned char *str;
     int len, bytes;
{
  unsigned char *src = str, *limit = str + bytes;
  unsigned char *dst;

  /* A leading run of bytes below 0x80 needs no conversion.  */
  while (src < limit && *src < 0x80)
    src++;
  if (src == limit)
    return bytes;

  /* Move the rest to the end of the work area, then expand it back
     towards the front.  */
  dst = src;
  bytes = limit - src;
  limit = str + len;
  safe_bcopy ((char *) src, (char *) (limit - bytes), bytes);
  src = limit - bytes;

  while (src < limit)
    {
      int c = *src;

      src++;
      if (c >= 0x80)
        c = BYTE8_TO_CHAR (c);
      dst += CHAR_STRING (c, dst);
    }
  return (dst - str);
}
760
/* Arrange the BYTES bytes of multibyte text at STR as unibyte text,
   in place.  Each character whose lead byte satisfies
   CHAR_BYTE8_HEAD_P is replaced by its single raw byte; all other
   sequences are copied unchanged.  Return the resulting byte
   length.  */

int
str_as_unibyte (str, bytes)
     unsigned char *str;
     int bytes;
{
  const unsigned char *p = str, *endp = str + bytes;
  unsigned char *to;
  int c, len;

  /* Skip the prefix that contains nothing to convert.  */
  while (p < endp)
    {
      c = *p;
      if (CHAR_BYTE8_HEAD_P (c))
        break;
      p += BYTES_BY_CHAR_HEAD (c);
    }

  /* From here on, compact the text towards the front.  */
  to = str + (p - str);
  while (p < endp)
    {
      c = *p;
      if (CHAR_BYTE8_HEAD_P (c))
        {
          c = STRING_CHAR_ADVANCE (p);
          *to++ = CHAR_TO_BYTE8 (c);
        }
      else
        {
          for (len = BYTES_BY_CHAR_HEAD (c); len--; )
            *to++ = *p++;
        }
    }
  return (to - str);
}
799
800int
801string_count_byte8 (string)
802 Lisp_Object string;
803{
804 int multibyte = STRING_MULTIBYTE (string);
8f924df7
KH
805 int nbytes = SBYTES (string);
806 unsigned char *p = SDATA (string);
0168c3d8
KH
807 unsigned char *pend = p + nbytes;
808 int count = 0;
809 int c, len;
810
811 if (multibyte)
812 while (p < pend)
813 {
814 c = *p;
815 len = BYTES_BY_CHAR_HEAD (c);
816
817 if (CHAR_BYTE8_HEAD_P (c))
818 count++;
819 p += len;
820 }
821 else
822 while (p < pend)
823 {
824 if (*p++ >= 0x80)
825 count++;
826 }
827 return count;
828}
829
830
831Lisp_Object
832string_escape_byte8 (string)
833 Lisp_Object string;
834{
8f924df7
KH
835 int nchars = SCHARS (string);
836 int nbytes = SBYTES (string);
0168c3d8
KH
837 int multibyte = STRING_MULTIBYTE (string);
838 int byte8_count;
15843e6f
KH
839 const unsigned char *src, *src_end;
840 unsigned char *dst;
0168c3d8
KH
841 Lisp_Object val;
842 int c, len;
843
844 if (multibyte && nchars == nbytes)
845 return string;
846
847 byte8_count = string_count_byte8 (string);
848
849 if (byte8_count == 0)
850 return string;
851
852 if (multibyte)
853 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
7b40ebaf 854 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
0168c3d8
KH
855 nbytes + byte8_count * 2);
856 else
857 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
858 val = make_uninit_string (nbytes + byte8_count * 3);
859
8f924df7 860 src = SDATA (string);
0168c3d8 861 src_end = src + nbytes;
8f924df7 862 dst = SDATA (val);
0168c3d8
KH
863 if (multibyte)
864 while (src < src_end)
865 {
866 c = *src;
867 len = BYTES_BY_CHAR_HEAD (c);
868
869 if (CHAR_BYTE8_HEAD_P (c))
870 {
871 c = STRING_CHAR_ADVANCE (src);
872 c = CHAR_TO_BYTE8 (c);
1889b238 873 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
874 dst += 4;
875 }
876 else
877 while (len--) *dst++ = *src++;
878 }
879 else
880 while (src < src_end)
881 {
882 c = *src++;
883 if (c >= 0x80)
884 {
1889b238 885 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
886 dst += 4;
887 }
888 else
889 *dst++ = c;
890 }
891 return val;
892}
893
894\f
8f924df7 895DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
0168c3d8 896 doc: /*
d2e83296
DL
897Concatenate all the argument characters and make the result a string.
898usage: (string &rest CHARACTERS) */)
0168c3d8
KH
899 (n, args)
900 int n;
901 Lisp_Object *args;
902{
903 int i;
904 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
905 unsigned char *p = buf;
906 int c;
907
908 for (i = 0; i < n; i++)
909 {
910 CHECK_CHARACTER (args[i]);
911 c = XINT (args[i]);
912 p += CHAR_STRING (c, p);
913 }
914
915 return make_string_from_bytes ((char *) buf, n, p - buf);
916}
917
/* One-time initialization hook for this file.  There is currently
   nothing to set up.  */

void
init_character_once ()
{
  return;
}
922
923#ifdef emacs
924
925void
926syms_of_character ()
927{
928 DEFSYM (Qcharacterp, "characterp");
929 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
930
931 staticpro (&Vchar_unify_table);
932 Vchar_unify_table = Qnil;
933
934 defsubr (&Smax_char);
935 defsubr (&Scharacterp);
936 defsubr (&Sunibyte_char_to_multibyte);
937 defsubr (&Smultibyte_char_to_unibyte);
938 defsubr (&Schar_bytes);
939 defsubr (&Schar_width);
940 defsubr (&Sstring_width);
941 defsubr (&Schar_direction);
942 defsubr (&Schars_in_region);
943 defsubr (&Sstring);
944
945 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
946 doc: /*
68978cf0
DL
947Vector recording all translation tables ever defined.
948Each element is a pair (SYMBOL . TABLE) relating the table to the
949symbol naming it. The ID of a translation table is an index into this vector. */);
0168c3d8
KH
950 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
951
952 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
953 doc: /*
954A char-table for characters which invoke auto-filling.
955Such characters have value t in this table. */);
956 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
6cc0e1ca
DL
957 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
958 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
0168c3d8
KH
959
960 DEFVAR_LISP ("char-width-table", &Vchar_width_table,
961 doc: /*
962A char-table for width (columns) of each character. */);
963 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
be8b50bc
KH
964 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
965 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
966 make_number (4));
0168c3d8
KH
967
968 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
969 doc: /* A char-table for direction of each character. */);
970 Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
971
972 DEFVAR_LISP ("printable-chars", &Vprintable_chars,
973 doc: /* A char-table for each printable character. */);
db6d4189 974 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
67dde660
KH
975 Fset_char_table_range (Vprintable_chars,
976 Fcons (make_number (32), make_number (126)), Qt);
977 Fset_char_table_range (Vprintable_chars,
978 Fcons (make_number (160),
979 make_number (MAX_5_BYTE_CHAR)), Qt);
15843e6f 980
c57f3328
KH
981 DEFVAR_LISP ("char-script-table", &Vchar_script_table,
982 doc: /* Char table of script symbols.
983It has one extra slot whose value is a list of script symbols. */);
984
985 /* Intern this now in case it isn't already done.
986 Setting this variable twice is harmless.
987 But don't staticpro it here--that is done in alloc.c. */
988 Qchar_table_extra_slots = intern ("char-table-extra-slots");
989 DEFSYM (Qchar_script_table, "char-script-table");
990 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
991 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
0168c3d8
KH
992}
993
994#endif /* emacs */