*** empty log message ***
[bpt/emacs.git] / src / character.c
CommitLineData
0168c3d8
KH
1/* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
8f924df7 3 Licensed to the Free Software Foundation.
cca4e3b0 4 Copyright (C) 2001, 2005 Free Software Foundation, Inc.
8f924df7 5 Copyright (C) 2003
0168c3d8
KH
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9This file is part of GNU Emacs.
10
11GNU Emacs is free software; you can redistribute it and/or modify
12it under the terms of the GNU General Public License as published by
13the Free Software Foundation; either version 2, or (at your option)
14any later version.
15
16GNU Emacs is distributed in the hope that it will be useful,
17but WITHOUT ANY WARRANTY; without even the implied warranty of
18MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19GNU General Public License for more details.
20
21You should have received a copy of the GNU General Public License
22along with GNU Emacs; see the file COPYING. If not, write to
23the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24Boston, MA 02111-1307, USA. */
25
26/* At first, see the document in `character.h' to understand the code
27 in this file. */
28
29#ifdef emacs
30#include <config.h>
31#endif
32
33#include <stdio.h>
34
35#ifdef emacs
36
37#include <sys/types.h>
38#include "lisp.h"
39#include "character.h"
40#include "buffer.h"
41#include "charset.h"
42#include "composite.h"
43#include "disptab.h"
44
45#else /* not emacs */
46
47#include "mulelib.h"
48
49#endif /* emacs */
50
51Lisp_Object Qcharacterp;
52
53/* Vector of translation table ever defined.
54 ID of a translation table is used to index this vector. */
55Lisp_Object Vtranslation_table_vector;
56
57/* A char-table for characters which may invoke auto-filling. */
58Lisp_Object Vauto_fill_chars;
59
60Lisp_Object Qauto_fill_chars;
61
33f91981
KH
62/* Char-table of information about which character to unify to which
63 Unicode character. */
0168c3d8
KH
64Lisp_Object Vchar_unify_table;
65
66/* A char-table. An element is non-nil iff the corresponding
67 character has a printable glyph. */
68Lisp_Object Vprintable_chars;
69
70/* A char-table. An elemnent is a column-width of the corresponding
71 character. */
72Lisp_Object Vchar_width_table;
73
74/* A char-table. An element is a symbol indicating the direction
75 property of corresponding character. */
76Lisp_Object Vchar_direction_table;
77
8973478b 78/* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
0168c3d8 79unsigned char *_fetch_multibyte_char_p;
0168c3d8 80
c57f3328
KH
81/* Char table of scripts. */
82Lisp_Object Vchar_script_table;
83
84static Lisp_Object Qchar_script_table;
85
b672c5ae
KH
86/* Mapping table from unibyte chars to multibyte chars. */
87int unibyte_to_multibyte_table[256];
15843e6f 88
90c9d035
KH
89/* Nth element is 1 iff unibyte char N can be mapped to a multibyte
90 char. */
91char unibyte_has_multibyte_table[256];
92
0168c3d8
KH
93\f
94
33f91981
KH
95/* Store multibyte form of character C at P. If C has modifier bits,
96 handle them appropriately. */
97
0168c3d8 98int
e3d8eb8c 99char_string (c, p)
0168c3d8 100 int c;
1889b238 101 unsigned char *p;
0168c3d8
KH
102{
103 int bytes;
104
e3d8eb8c
KH
105 if (c & CHAR_MODIFIER_MASK)
106 {
cc6dfd2a
KH
107 /* As an non-ASCII character can't have modifier bits, we just
108 ignore the bits. */
109 if (ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
e3d8eb8c
KH
110 {
111 /* For Meta, Shift, and Control modifiers, we need special care. */
112 if (c & CHAR_META)
113 {
114 /* Move the meta bit to the right place for a string. */
115 c = (c & ~CHAR_META) | 0x80;
116 }
117 if (c & CHAR_SHIFT)
118 {
119 /* Shift modifier is valid only with [A-Za-z]. */
120 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
121 c &= ~CHAR_SHIFT;
122 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
123 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
124 }
125 if (c & CHAR_CTL)
126 {
127 /* Simulate the code in lread.c. */
128 /* Allow `\C- ' and `\C-?'. */
129 if (c == (CHAR_CTL | ' '))
130 c = 0;
131 else if (c == (CHAR_CTL | '?'))
132 c = 127;
133 /* ASCII control chars are made from letters (both cases),
134 as well as the non-letters within 0100...0137. */
135 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
136 c &= (037 | (~0177 & ~CHAR_CTL));
137 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
138 c &= (037 | (~0177 & ~CHAR_CTL));
139 }
140 }
141
142 /* If C still has any modifier bits, just ignore it. */
143 c &= ~CHAR_MODIFIER_MASK;
144 }
145
0168c3d8
KH
146 MAYBE_UNIFY_CHAR (c);
147
e3d8eb8c 148 if (c <= MAX_3_BYTE_CHAR)
0168c3d8
KH
149 {
150 bytes = CHAR_STRING (c, p);
151 }
152 else if (c <= MAX_4_BYTE_CHAR)
153 {
154 p[0] = (0xF0 | (c >> 18));
155 p[1] = (0x80 | ((c >> 12) & 0x3F));
156 p[2] = (0x80 | ((c >> 6) & 0x3F));
157 p[3] = (0x80 | (c & 0x3F));
158 bytes = 4;
159 }
e3d8eb8c 160 else if (c <= MAX_5_BYTE_CHAR)
0168c3d8
KH
161 {
162 p[0] = 0xF8;
163 p[1] = (0x80 | ((c >> 18) & 0x0F));
164 p[2] = (0x80 | ((c >> 12) & 0x3F));
165 p[3] = (0x80 | ((c >> 6) & 0x3F));
166 p[4] = (0x80 | (c & 0x3F));
167 bytes = 5;
168 }
e3d8eb8c
KH
169 else
170 {
171 c = CHAR_TO_BYTE8 (c);
172 bytes = BYTE8_STRING (c, p);
173 }
1889b238 174
0168c3d8
KH
175 return bytes;
176}
177
178
33f91981
KH
179/* Return a character whose multibyte form is at P. Set LEN is not
180 NULL, it must be a pointer to integer. In that case, set *LEN to
181 the byte length of the multibyte form. If ADVANCED is not NULL, is
182 must be a pointer to unsigned char. In that case, set *ADVANCED to
183 the ending address (i.e. the starting address of the next
184 character) of the multibyte form. */
185
0168c3d8 186int
e3d8eb8c 187string_char (p, advanced, len)
15843e6f
KH
188 const unsigned char *p;
189 const unsigned char **advanced;
0168c3d8
KH
190 int *len;
191{
1889b238 192 int c;
15843e6f 193 const unsigned char *saved_p = p;
0168c3d8
KH
194
195 if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
196 {
197 c = STRING_CHAR_ADVANCE (p);
198 }
199 else if (! (*p & 0x08))
200 {
201 c = ((((p)[0] & 0xF) << 18)
202 | (((p)[1] & 0x3F) << 12)
203 | (((p)[2] & 0x3F) << 6)
204 | ((p)[3] & 0x3F));
205 p += 4;
206 }
207 else
208 {
209 c = ((((p)[1] & 0x3F) << 18)
210 | (((p)[2] & 0x3F) << 12)
211 | (((p)[3] & 0x3F) << 6)
212 | ((p)[4] & 0x3F));
213 p += 5;
214 }
215
216 MAYBE_UNIFY_CHAR (c);
217
218 if (len)
219 *len = p - saved_p;
220 if (advanced)
221 *advanced = p;
222 return c;
223}
224
225
226/* Translate character C by translation table TABLE. If C is
227 negative, translate a character specified by CHARSET and CODE. If
228 no translation is found in TABLE, return the untranslated
10453be9
KH
229 character. If TABLE is a list, elements are char tables. In this
230 case, translace C by all tables. */
0168c3d8
KH
231
232int
233translate_char (table, c)
234 Lisp_Object table;
235 int c;
236{
10453be9
KH
237 if (CHAR_TABLE_P (table))
238 {
239 Lisp_Object ch;
240
241 ch = CHAR_TABLE_REF (table, c);
242 if (CHARACTERP (ch))
243 c = XINT (ch);
244 }
245 else
246 {
247 for (; CONSP (table); table = XCDR (table))
248 c = translate_char (XCAR (table), c);
249 }
250 return c;
0168c3d8
KH
251}
252
0168c3d8 253/* Convert the multibyte character C to unibyte 8-bit character based
ac86488b
KH
254 on the current value of charset_unibyte. If dimension of
255 charset_unibyte is more than one, return (C & 0xFF).
0168c3d8
KH
256
257 The argument REV_TBL is now ignored. It will be removed in the
258 future. */
259
260int
261multibyte_char_to_unibyte (c, rev_tbl)
262 int c;
263 Lisp_Object rev_tbl;
264{
b672c5ae
KH
265 struct charset *charset;
266 unsigned c1;
0168c3d8 267
b672c5ae
KH
268 if (CHAR_BYTE8_P (c))
269 return CHAR_TO_BYTE8 (c);
270 charset = CHARSET_FROM_ID (charset_unibyte);
271 c1 = ENCODE_CHAR (charset, c);
0168c3d8
KH
272 return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
273}
274
275
276DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
277 doc: /* Return non-nil if OBJECT is a character. */)
278 (object, ignore)
279 Lisp_Object object, ignore;
280{
281 return (CHARACTERP (object) ? Qt : Qnil);
282}
283
284DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
285 doc: /* Return the character of the maximum code. */)
286 ()
287{
288 return make_number (MAX_CHAR);
289}
290
291DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
292 Sunibyte_char_to_multibyte, 1, 1, 0,
293 doc: /* Convert the unibyte character CH to multibyte character.
294The multibyte character is a result of decoding CH by
ed1d5bc0 295the current unibyte charset (see `unibyte-charset'). */)
0168c3d8
KH
296 (ch)
297 Lisp_Object ch;
298{
299 int c;
300 struct charset *charset;
301
302 CHECK_CHARACTER (ch);
303 c = XFASTINT (ch);
304 if (c >= 0400)
305 error ("Invalid unibyte character: %d", c);
ac86488b 306 charset = CHARSET_FROM_ID (charset_unibyte);
0168c3d8
KH
307 c = DECODE_CHAR (charset, c);
308 if (c < 0)
3c5a53bd 309 c = BYTE8_TO_CHAR (XFASTINT (ch));
0168c3d8
KH
310 return make_number (c);
311}
312
313DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
314 Smultibyte_char_to_unibyte, 1, 1, 0,
315 doc: /* Convert the multibyte character CH to unibyte character.\n\
316The unibyte character is a result of encoding CH by
317the current primary charset (value of `charset-primary'). */)
318 (ch)
319 Lisp_Object ch;
320{
321 int c;
0168c3d8
KH
322
323 CHECK_CHARACTER (ch);
324 c = XFASTINT (ch);
3c5a53bd
KH
325 c = CHAR_TO_BYTE8 (c);
326 return make_number (c);
0168c3d8
KH
327}
328
329DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
330 doc: /* Return 1 regardless of the argument CHAR.
331This is now an obsolete function. We keep it just for backward compatibility. */)
332 (ch)
333 Lisp_Object ch;
334{
335 CHECK_CHARACTER (ch);
336 return make_number (1);
337}
338
339DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
340 doc: /* Return width of CHAR when displayed in the current buffer.
341The width is measured by how many columns it occupies on the screen.
342Tab is taken to occupy `tab-width' columns. */)
343 (ch)
344 Lisp_Object ch;
345{
346 Lisp_Object disp;
347 int c, width;
348 struct Lisp_Char_Table *dp = buffer_display_table ();
349
350 CHECK_CHARACTER (ch);
351 c = XINT (ch);
352
353 /* Get the way the display table would display it. */
354 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
355
356 if (VECTORP (disp))
357 width = ASIZE (disp);
358 else
359 width = CHAR_WIDTH (c);
360
361 return make_number (width);
362}
363
0168c3d8
KH
364/* Return width of string STR of length LEN when displayed in the
365 current buffer. The width is measured by how many columns it
366 occupies on the screen. If PRECISION > 0, return the width of
367 longest substring that doesn't exceed PRECISION, and set number of
368 characters and bytes of the substring in *NCHARS and *NBYTES
369 respectively. */
370
1889b238 371int
0168c3d8 372c_string_width (str, len, precision, nchars, nbytes)
8f924df7 373 const unsigned char *str;
0168c3d8
KH
374 int precision, *nchars, *nbytes;
375{
376 int i = 0, i_byte = 0;
377 int width = 0;
378 struct Lisp_Char_Table *dp = buffer_display_table ();
379
380 while (i_byte < len)
381 {
382 int bytes, thiswidth;
383 Lisp_Object val;
384 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
385
386 if (dp)
387 {
388 val = DISP_CHAR_VECTOR (dp, c);
389 if (VECTORP (val))
390 thiswidth = XVECTOR (val)->size;
391 else
392 thiswidth = CHAR_WIDTH (c);
393 }
394 else
395 {
396 thiswidth = CHAR_WIDTH (c);
397 }
398
399 if (precision > 0
400 && (width + thiswidth > precision))
401 {
402 *nchars = i;
403 *nbytes = i_byte;
404 return width;
405 }
406 i++;
407 i_byte += bytes;
408 width += thiswidth;
409 }
410
411 if (precision > 0)
412 {
413 *nchars = i;
414 *nbytes = i_byte;
415 }
416
417 return width;
418}
419
1889b238
KH
420/* Return width of string STR of length LEN when displayed in the
421 current buffer. The width is measured by how many columns it
422 occupies on the screen. */
423
424int
425strwidth (str, len)
426 unsigned char *str;
427 int len;
428{
429 return c_string_width (str, len, -1, NULL, NULL);
430}
431
0168c3d8
KH
432/* Return width of Lisp string STRING when displayed in the current
433 buffer. The width is measured by how many columns it occupies on
434 the screen while paying attention to compositions. If PRECISION >
435 0, return the width of longest substring that doesn't exceed
436 PRECISION, and set number of characters and bytes of the substring
437 in *NCHARS and *NBYTES respectively. */
438
439int
440lisp_string_width (string, precision, nchars, nbytes)
441 Lisp_Object string;
442 int precision, *nchars, *nbytes;
443{
8f924df7
KH
444 int len = SCHARS (string);
445 unsigned char *str = SDATA (string);
0168c3d8
KH
446 int i = 0, i_byte = 0;
447 int width = 0;
448 struct Lisp_Char_Table *dp = buffer_display_table ();
449
450 while (i < len)
451 {
452 int chars, bytes, thiswidth;
453 Lisp_Object val;
454 int cmp_id;
f4bc0685 455 EMACS_INT ignore, end;
0168c3d8
KH
456
457 if (find_composition (i, -1, &ignore, &end, &val, string)
458 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
459 >= 0))
460 {
461 thiswidth = composition_table[cmp_id]->width;
462 chars = end - i;
463 bytes = string_char_to_byte (string, end) - i_byte;
464 }
465 else if (dp)
466 {
467 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
468
469 chars = 1;
470 val = DISP_CHAR_VECTOR (dp, c);
471 if (VECTORP (val))
472 thiswidth = XVECTOR (val)->size;
473 else
474 thiswidth = CHAR_WIDTH (c);
475 }
476 else
477 {
478 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
479
480 chars = 1;
481 thiswidth = CHAR_WIDTH (c);
482 }
483
484 if (precision > 0
485 && (width + thiswidth > precision))
486 {
487 *nchars = i;
488 *nbytes = i_byte;
489 return width;
490 }
491 i += chars;
492 i_byte += bytes;
493 width += thiswidth;
494 }
495
496 if (precision > 0)
497 {
498 *nchars = i;
499 *nbytes = i_byte;
500 }
501
502 return width;
503}
504
505DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
506 doc: /* Return width of STRING when displayed in the current buffer.
507Width is measured by how many columns it occupies on the screen.
508When calculating width of a multibyte character in STRING,
509only the base leading-code is considered; the validity of
510the following bytes is not checked. Tabs in STRING are always
511taken to occupy `tab-width' columns. */)
512 (str)
513 Lisp_Object str;
514{
515 Lisp_Object val;
516
517 CHECK_STRING (str);
518 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
519 return val;
520}
521
522DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
523 doc: /* Return the direction of CHAR.
524The returned value is 0 for left-to-right and 1 for right-to-left. */)
525 (ch)
526 Lisp_Object ch;
527{
528 int c;
529
530 CHECK_CHARACTER (ch);
531 c = XINT (ch);
532 return CHAR_TABLE_REF (Vchar_direction_table, c);
533}
534
0168c3d8
KH
535/* Return the number of characters in the NBYTES bytes at PTR.
536 This works by looking at the contents and checking for multibyte
537 sequences while assuming that there's no invalid sequence.
538 However, if the current buffer has enable-multibyte-characters =
539 nil, we treat each byte as a character. */
540
541int
542chars_in_text (ptr, nbytes)
8f924df7 543 const unsigned char *ptr;
0168c3d8
KH
544 int nbytes;
545{
546 /* current_buffer is null at early stages of Emacs initialization. */
547 if (current_buffer == 0
548 || NILP (current_buffer->enable_multibyte_characters))
549 return nbytes;
550
551 return multibyte_chars_in_text (ptr, nbytes);
552}
553
554/* Return the number of characters in the NBYTES bytes at PTR.
555 This works by looking at the contents and checking for multibyte
556 sequences while assuming that there's no invalid sequence. It
557 ignores enable-multibyte-characters. */
558
559int
560multibyte_chars_in_text (ptr, nbytes)
8f924df7 561 const unsigned char *ptr;
0168c3d8
KH
562 int nbytes;
563{
8f924df7 564 const unsigned char *endp = ptr + nbytes;
0168c3d8
KH
565 int chars = 0;
566
567 while (ptr < endp)
568 {
569 int len = MULTIBYTE_LENGTH (ptr, endp);
570
571 if (len == 0)
572 abort ();
573 ptr += len;
574 chars++;
575 }
576
577 return chars;
578}
579
580/* Parse unibyte text at STR of LEN bytes as a multibyte text, count
581 characters and bytes in it, and store them in *NCHARS and *NBYTES
582 respectively. On counting bytes, pay attention to that 8-bit
583 characters not constructing a valid multibyte sequence are
584 represented by 2-byte in a multibyte text. */
585
586void
587parse_str_as_multibyte (str, len, nchars, nbytes)
8f924df7 588 const unsigned char *str;
0168c3d8
KH
589 int len, *nchars, *nbytes;
590{
8f924df7 591 const unsigned char *endp = str + len;
0168c3d8
KH
592 int n, chars = 0, bytes = 0;
593
594 if (len >= MAX_MULTIBYTE_LENGTH)
595 {
8f924df7 596 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
0168c3d8
KH
597 while (str < adjusted_endp)
598 {
599 if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
600 str += n, bytes += n;
601 else
602 str++, bytes += 2;
603 chars++;
604 }
605 }
606 while (str < endp)
607 {
608 if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
609 str += n, bytes += n;
610 else
611 str++, bytes += 2;
612 chars++;
613 }
614
615 *nchars = chars;
616 *nbytes = bytes;
617 return;
618}
619
620/* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
621 It actually converts only such 8-bit characters that don't contruct
622 a multibyte sequence to multibyte forms of Latin-1 characters. If
623 NCHARS is nonzero, set *NCHARS to the number of characters in the
624 text. It is assured that we can use LEN bytes at STR as a work
625 area and that is enough. Return the number of bytes of the
626 resulting text. */
627
628int
629str_as_multibyte (str, len, nbytes, nchars)
630 unsigned char *str;
631 int len, nbytes, *nchars;
632{
633 unsigned char *p = str, *endp = str + nbytes;
634 unsigned char *to;
635 int chars = 0;
636 int n;
637
638 if (nbytes >= MAX_MULTIBYTE_LENGTH)
639 {
640 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
641 while (p < adjusted_endp
642 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
643 p += n, chars++;
644 }
645 while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
646 p += n, chars++;
647 if (nchars)
648 *nchars = chars;
649 if (p == endp)
650 return nbytes;
651
652 to = p;
653 nbytes = endp - p;
654 endp = str + len;
655 safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
656 p = endp - nbytes;
657
658 if (nbytes >= MAX_MULTIBYTE_LENGTH)
659 {
660 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
661 while (p < adjusted_endp)
662 {
663 if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
664 {
665 while (n--)
666 *to++ = *p++;
667 }
668 else
669 {
670 int c = *p++;
671 c = BYTE8_TO_CHAR (c);
672 to += CHAR_STRING (c, to);
673 }
674 }
675 chars++;
676 }
677 while (p < endp)
678 {
679 if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
680 {
681 while (n--)
682 *to++ = *p++;
8f924df7 683 }
0168c3d8
KH
684 else
685 {
686 int c = *p++;
687 c = BYTE8_TO_CHAR (c);
688 to += CHAR_STRING (c, to);
689 }
690 chars++;
691 }
692 if (nchars)
693 *nchars = chars;
694 return (to - str);
695}
696
697/* Parse unibyte string at STR of LEN bytes, and return the number of
698 bytes it may ocupy when converted to multibyte string by
699 `str_to_multibyte'. */
700
701int
702parse_str_to_multibyte (str, len)
703 unsigned char *str;
704 int len;
705{
706 unsigned char *endp = str + len;
707 int bytes;
708
709 for (bytes = 0; str < endp; str++)
710 bytes += (*str < 0x80) ? 1 : 2;
711 return bytes;
712}
713
714
715/* Convert unibyte text at STR of NBYTES bytes to a multibyte text
716 that contains the same single-byte characters. It actually
717 converts all 8-bit characters to multibyte forms. It is assured
718 that we can use LEN bytes at STR as a work area and that is
719 enough. */
720
721int
722str_to_multibyte (str, len, bytes)
723 unsigned char *str;
724 int len, bytes;
725{
726 unsigned char *p = str, *endp = str + bytes;
727 unsigned char *to;
728
729 while (p < endp && *p < 0x80) p++;
730 if (p == endp)
731 return bytes;
732 to = p;
733 bytes = endp - p;
734 endp = str + len;
735 safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
736 p = endp - bytes;
8f924df7 737 while (p < endp)
0168c3d8
KH
738 {
739 int c = *p++;
740
741 if (c >= 0x80)
742 c = BYTE8_TO_CHAR (c);
743 to += CHAR_STRING (c, to);
744 }
745 return (to - str);
746}
747
748/* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
749 actually converts characters in the range 0x80..0xFF to
750 unibyte. */
751
752int
753str_as_unibyte (str, bytes)
754 unsigned char *str;
755 int bytes;
756{
15843e6f
KH
757 const unsigned char *p = str, *endp = str + bytes;
758 unsigned char *to;
0168c3d8
KH
759 int c, len;
760
761 while (p < endp)
762 {
763 c = *p;
764 len = BYTES_BY_CHAR_HEAD (c);
765 if (CHAR_BYTE8_HEAD_P (c))
766 break;
767 p += len;
768 }
15843e6f 769 to = str + (p - str);
8f924df7 770 while (p < endp)
0168c3d8
KH
771 {
772 c = *p;
773 len = BYTES_BY_CHAR_HEAD (c);
774 if (CHAR_BYTE8_HEAD_P (c))
775 {
776 c = STRING_CHAR_ADVANCE (p);
777 *to++ = CHAR_TO_BYTE8 (c);
778 }
779 else
780 {
781 while (len--) *to++ = *p++;
782 }
783 }
784 return (to - str);
785}
786
787int
788string_count_byte8 (string)
789 Lisp_Object string;
790{
791 int multibyte = STRING_MULTIBYTE (string);
8f924df7
KH
792 int nbytes = SBYTES (string);
793 unsigned char *p = SDATA (string);
0168c3d8
KH
794 unsigned char *pend = p + nbytes;
795 int count = 0;
796 int c, len;
797
798 if (multibyte)
799 while (p < pend)
800 {
801 c = *p;
802 len = BYTES_BY_CHAR_HEAD (c);
803
804 if (CHAR_BYTE8_HEAD_P (c))
805 count++;
806 p += len;
807 }
808 else
809 while (p < pend)
810 {
811 if (*p++ >= 0x80)
812 count++;
813 }
814 return count;
815}
816
817
818Lisp_Object
819string_escape_byte8 (string)
820 Lisp_Object string;
821{
8f924df7
KH
822 int nchars = SCHARS (string);
823 int nbytes = SBYTES (string);
0168c3d8
KH
824 int multibyte = STRING_MULTIBYTE (string);
825 int byte8_count;
15843e6f
KH
826 const unsigned char *src, *src_end;
827 unsigned char *dst;
0168c3d8
KH
828 Lisp_Object val;
829 int c, len;
830
831 if (multibyte && nchars == nbytes)
832 return string;
833
834 byte8_count = string_count_byte8 (string);
835
836 if (byte8_count == 0)
837 return string;
838
839 if (multibyte)
840 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
7b40ebaf 841 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
0168c3d8
KH
842 nbytes + byte8_count * 2);
843 else
844 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
845 val = make_uninit_string (nbytes + byte8_count * 3);
846
8f924df7 847 src = SDATA (string);
0168c3d8 848 src_end = src + nbytes;
8f924df7 849 dst = SDATA (val);
0168c3d8
KH
850 if (multibyte)
851 while (src < src_end)
852 {
853 c = *src;
854 len = BYTES_BY_CHAR_HEAD (c);
855
856 if (CHAR_BYTE8_HEAD_P (c))
857 {
858 c = STRING_CHAR_ADVANCE (src);
859 c = CHAR_TO_BYTE8 (c);
1889b238 860 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
861 dst += 4;
862 }
863 else
864 while (len--) *dst++ = *src++;
865 }
866 else
867 while (src < src_end)
868 {
869 c = *src++;
870 if (c >= 0x80)
871 {
1889b238 872 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
873 dst += 4;
874 }
875 else
876 *dst++ = c;
877 }
878 return val;
879}
880
881\f
8f924df7 882DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
0168c3d8 883 doc: /*
d2e83296
DL
884Concatenate all the argument characters and make the result a string.
885usage: (string &rest CHARACTERS) */)
0168c3d8
KH
886 (n, args)
887 int n;
888 Lisp_Object *args;
889{
890 int i;
891 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
892 unsigned char *p = buf;
893 int c;
894
895 for (i = 0; i < n; i++)
896 {
897 CHECK_CHARACTER (args[i]);
898 c = XINT (args[i]);
899 p += CHAR_STRING (c, p);
900 }
901
902 return make_string_from_bytes ((char *) buf, n, p - buf);
903}
904
905void
906init_character_once ()
907{
908}
909
910#ifdef emacs
911
912void
913syms_of_character ()
914{
915 DEFSYM (Qcharacterp, "characterp");
916 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
917
918 staticpro (&Vchar_unify_table);
919 Vchar_unify_table = Qnil;
920
921 defsubr (&Smax_char);
922 defsubr (&Scharacterp);
923 defsubr (&Sunibyte_char_to_multibyte);
924 defsubr (&Smultibyte_char_to_unibyte);
925 defsubr (&Schar_bytes);
926 defsubr (&Schar_width);
927 defsubr (&Sstring_width);
928 defsubr (&Schar_direction);
0168c3d8
KH
929 defsubr (&Sstring);
930
931 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
932 doc: /*
68978cf0
DL
933Vector recording all translation tables ever defined.
934Each element is a pair (SYMBOL . TABLE) relating the table to the
935symbol naming it. The ID of a translation table is an index into this vector. */);
0168c3d8
KH
936 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
937
938 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
939 doc: /*
940A char-table for characters which invoke auto-filling.
941Such characters have value t in this table. */);
942 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
6cc0e1ca
DL
943 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
944 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
0168c3d8
KH
945
946 DEFVAR_LISP ("char-width-table", &Vchar_width_table,
947 doc: /*
948A char-table for width (columns) of each character. */);
949 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
be8b50bc
KH
950 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
951 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
952 make_number (4));
0168c3d8
KH
953
954 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
955 doc: /* A char-table for direction of each character. */);
956 Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
957
958 DEFVAR_LISP ("printable-chars", &Vprintable_chars,
959 doc: /* A char-table for each printable character. */);
db6d4189 960 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
67dde660
KH
961 Fset_char_table_range (Vprintable_chars,
962 Fcons (make_number (32), make_number (126)), Qt);
963 Fset_char_table_range (Vprintable_chars,
964 Fcons (make_number (160),
965 make_number (MAX_5_BYTE_CHAR)), Qt);
15843e6f 966
c57f3328
KH
967 DEFVAR_LISP ("char-script-table", &Vchar_script_table,
968 doc: /* Char table of script symbols.
969It has one extra slot whose value is a list of script symbols. */);
970
971 /* Intern this now in case it isn't already done.
972 Setting this variable twice is harmless.
973 But don't staticpro it here--that is done in alloc.c. */
974 Qchar_table_extra_slots = intern ("char-table-extra-slots");
975 DEFSYM (Qchar_script_table, "char-script-table");
976 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
977 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
0168c3d8
KH
978}
979
980#endif /* emacs */
fbaf0946
MB
981
982/* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
983 (do not change this comment) */