Merge from emacs--devo--0
[bpt/emacs.git] / src / character.c
1 /* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2005, 2006 Free Software Foundation, Inc.
5 Copyright (C) 2003, 2006
6 National Institute of Advanced Industrial Science and Technology (AIST)
7 Registration Number H13PRO009
8
9 This file is part of GNU Emacs.
10
11 GNU Emacs is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2, or (at your option)
14 any later version.
15
16 GNU Emacs is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License
22 along with GNU Emacs; see the file COPYING. If not, write to
23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
24 Boston, MA 02111-1307, USA. */
25
26 /* At first, see the document in `character.h' to understand the code
27 in this file. */
28
29 #ifdef emacs
30 #include <config.h>
31 #endif
32
33 #include <stdio.h>
34
35 #ifdef emacs
36
37 #include <sys/types.h>
38 #include "lisp.h"
39 #include "character.h"
40 #include "buffer.h"
41 #include "charset.h"
42 #include "composite.h"
43 #include "disptab.h"
44
45 #else /* not emacs */
46
47 #include "mulelib.h"
48
49 #endif /* emacs */
50
51 Lisp_Object Qcharacterp;
52
53 /* Vector of translation table ever defined.
54 ID of a translation table is used to index this vector. */
55 Lisp_Object Vtranslation_table_vector;
56
57 /* A char-table for characters which may invoke auto-filling. */
58 Lisp_Object Vauto_fill_chars;
59
60 Lisp_Object Qauto_fill_chars;
61
62 /* Char-table of information about which character to unify to which
63 Unicode character. */
64 Lisp_Object Vchar_unify_table;
65
66 /* A char-table. An element is non-nil iff the corresponding
67 character has a printable glyph. */
68 Lisp_Object Vprintable_chars;
69
70 /* A char-table. An elemnent is a column-width of the corresponding
71 character. */
72 Lisp_Object Vchar_width_table;
73
74 /* A char-table. An element is a symbol indicating the direction
75 property of corresponding character. */
76 Lisp_Object Vchar_direction_table;
77
78 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
79 unsigned char *_fetch_multibyte_char_p;
80
81 /* Char table of scripts. */
82 Lisp_Object Vchar_script_table;
83
84 /* Alist of scripts vs representative characters. */
85 Lisp_Object Vscript_representative_chars;
86
87 static Lisp_Object Qchar_script_table;
88
89 /* Mapping table from unibyte chars to multibyte chars. */
90 int unibyte_to_multibyte_table[256];
91
92 /* Nth element is 1 iff unibyte char N can be mapped to a multibyte
93 char. */
94 char unibyte_has_multibyte_table[256];
95
96 \f
97
98 /* Store multibyte form of character C at P. If C has modifier bits,
99 handle them appropriately. */
100
101 int
102 char_string (c, p)
103 unsigned c;
104 unsigned char *p;
105 {
106 int bytes;
107
108 if (c & CHAR_MODIFIER_MASK)
109 {
110 /* As an non-ASCII character can't have modifier bits, we just
111 ignore the bits. */
112 if (ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
113 {
114 /* For Meta, Shift, and Control modifiers, we need special care. */
115 if (c & CHAR_META)
116 {
117 /* Move the meta bit to the right place for a string. */
118 c = (c & ~CHAR_META) | 0x80;
119 }
120 if (c & CHAR_SHIFT)
121 {
122 /* Shift modifier is valid only with [A-Za-z]. */
123 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
124 c &= ~CHAR_SHIFT;
125 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
126 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
127 }
128 if (c & CHAR_CTL)
129 {
130 /* Simulate the code in lread.c. */
131 /* Allow `\C- ' and `\C-?'. */
132 if (c == (CHAR_CTL | ' '))
133 c = 0;
134 else if (c == (CHAR_CTL | '?'))
135 c = 127;
136 /* ASCII control chars are made from letters (both cases),
137 as well as the non-letters within 0100...0137. */
138 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
139 c &= (037 | (~0177 & ~CHAR_CTL));
140 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
141 c &= (037 | (~0177 & ~CHAR_CTL));
142 }
143 }
144
145 /* If C still has any modifier bits, just ignore it. */
146 c &= ~CHAR_MODIFIER_MASK;
147 }
148
149 MAYBE_UNIFY_CHAR (c);
150
151 if (c <= MAX_3_BYTE_CHAR)
152 {
153 bytes = CHAR_STRING (c, p);
154 }
155 else if (c <= MAX_4_BYTE_CHAR)
156 {
157 p[0] = (0xF0 | (c >> 18));
158 p[1] = (0x80 | ((c >> 12) & 0x3F));
159 p[2] = (0x80 | ((c >> 6) & 0x3F));
160 p[3] = (0x80 | (c & 0x3F));
161 bytes = 4;
162 }
163 else if (c <= MAX_5_BYTE_CHAR)
164 {
165 p[0] = 0xF8;
166 p[1] = (0x80 | ((c >> 18) & 0x0F));
167 p[2] = (0x80 | ((c >> 12) & 0x3F));
168 p[3] = (0x80 | ((c >> 6) & 0x3F));
169 p[4] = (0x80 | (c & 0x3F));
170 bytes = 5;
171 }
172 else if (c <= MAX_CHAR)
173 {
174 c = CHAR_TO_BYTE8 (c);
175 bytes = BYTE8_STRING (c, p);
176 }
177 else
178 error ("Invalid character: %d", c);
179
180 return bytes;
181 }
182
183
184 /* Return a character whose multibyte form is at P. Set LEN is not
185 NULL, it must be a pointer to integer. In that case, set *LEN to
186 the byte length of the multibyte form. If ADVANCED is not NULL, is
187 must be a pointer to unsigned char. In that case, set *ADVANCED to
188 the ending address (i.e. the starting address of the next
189 character) of the multibyte form. */
190
191 int
192 string_char (p, advanced, len)
193 const unsigned char *p;
194 const unsigned char **advanced;
195 int *len;
196 {
197 int c;
198 const unsigned char *saved_p = p;
199
200 if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
201 {
202 c = STRING_CHAR_ADVANCE (p);
203 }
204 else if (! (*p & 0x08))
205 {
206 c = ((((p)[0] & 0xF) << 18)
207 | (((p)[1] & 0x3F) << 12)
208 | (((p)[2] & 0x3F) << 6)
209 | ((p)[3] & 0x3F));
210 p += 4;
211 }
212 else
213 {
214 c = ((((p)[1] & 0x3F) << 18)
215 | (((p)[2] & 0x3F) << 12)
216 | (((p)[3] & 0x3F) << 6)
217 | ((p)[4] & 0x3F));
218 p += 5;
219 }
220
221 MAYBE_UNIFY_CHAR (c);
222
223 if (len)
224 *len = p - saved_p;
225 if (advanced)
226 *advanced = p;
227 return c;
228 }
229
230
231 /* Translate character C by translation table TABLE. If C is
232 negative, translate a character specified by CHARSET and CODE. If
233 no translation is found in TABLE, return the untranslated
234 character. If TABLE is a list, elements are char tables. In this
235 case, translace C by all tables. */
236
237 int
238 translate_char (table, c)
239 Lisp_Object table;
240 int c;
241 {
242 if (CHAR_TABLE_P (table))
243 {
244 Lisp_Object ch;
245
246 ch = CHAR_TABLE_REF (table, c);
247 if (CHARACTERP (ch))
248 c = XINT (ch);
249 }
250 else
251 {
252 for (; CONSP (table); table = XCDR (table))
253 c = translate_char (XCAR (table), c);
254 }
255 return c;
256 }
257
258 /* Convert the multibyte character C to unibyte 8-bit character based
259 on the current value of charset_unibyte. If dimension of
260 charset_unibyte is more than one, return (C & 0xFF).
261
262 The argument REV_TBL is now ignored. It will be removed in the
263 future. */
264
265 int
266 multibyte_char_to_unibyte (c, rev_tbl)
267 int c;
268 Lisp_Object rev_tbl;
269 {
270 struct charset *charset;
271 unsigned c1;
272
273 if (CHAR_BYTE8_P (c))
274 return CHAR_TO_BYTE8 (c);
275 charset = CHARSET_FROM_ID (charset_unibyte);
276 c1 = ENCODE_CHAR (charset, c);
277 return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
278 }
279
280 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
281 by charset_unibyte. */
282
283 int
284 multibyte_char_to_unibyte_safe (c)
285 int c;
286 {
287 struct charset *charset;
288 unsigned c1;
289
290 if (CHAR_BYTE8_P (c))
291 return CHAR_TO_BYTE8 (c);
292 charset = CHARSET_FROM_ID (charset_unibyte);
293 c1 = ENCODE_CHAR (charset, c);
294 return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : -1);
295 }
296
297 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
298 doc: /* Return non-nil if OBJECT is a character. */)
299 (object, ignore)
300 Lisp_Object object, ignore;
301 {
302 return (CHARACTERP (object) ? Qt : Qnil);
303 }
304
305 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
306 doc: /* Return the character of the maximum code. */)
307 ()
308 {
309 return make_number (MAX_CHAR);
310 }
311
312 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
313 Sunibyte_char_to_multibyte, 1, 1, 0,
314 doc: /* Convert the unibyte character CH to multibyte character.
315 The multibyte character is a result of decoding CH by
316 the current unibyte charset (see `unibyte-charset'). */)
317 (ch)
318 Lisp_Object ch;
319 {
320 int c;
321 struct charset *charset;
322
323 CHECK_CHARACTER (ch);
324 c = XFASTINT (ch);
325 if (c >= 0400)
326 error ("Invalid unibyte character: %d", c);
327 charset = CHARSET_FROM_ID (charset_unibyte);
328 c = DECODE_CHAR (charset, c);
329 if (c < 0)
330 c = BYTE8_TO_CHAR (XFASTINT (ch));
331 return make_number (c);
332 }
333
334 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
335 Smultibyte_char_to_unibyte, 1, 1, 0,
336 doc: /* Convert the multibyte character CH to unibyte character.\n\
337 The unibyte character is a result of encoding CH by
338 the current primary charset (value of `charset-primary'). */)
339 (ch)
340 Lisp_Object ch;
341 {
342 int c;
343
344 CHECK_CHARACTER (ch);
345 c = XFASTINT (ch);
346 c = CHAR_TO_BYTE8 (c);
347 return make_number (c);
348 }
349
350 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
351 doc: /* Return 1 regardless of the argument CHAR.
352 This is now an obsolete function. We keep it just for backward compatibility. */)
353 (ch)
354 Lisp_Object ch;
355 {
356 CHECK_CHARACTER (ch);
357 return make_number (1);
358 }
359
360 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
361 doc: /* Return width of CHAR when displayed in the current buffer.
362 The width is measured by how many columns it occupies on the screen.
363 Tab is taken to occupy `tab-width' columns. */)
364 (ch)
365 Lisp_Object ch;
366 {
367 Lisp_Object disp;
368 int c, width;
369 struct Lisp_Char_Table *dp = buffer_display_table ();
370
371 CHECK_CHARACTER (ch);
372 c = XINT (ch);
373
374 /* Get the way the display table would display it. */
375 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
376
377 if (VECTORP (disp))
378 width = ASIZE (disp);
379 else
380 width = CHAR_WIDTH (c);
381
382 return make_number (width);
383 }
384
385 /* Return width of string STR of length LEN when displayed in the
386 current buffer. The width is measured by how many columns it
387 occupies on the screen. If PRECISION > 0, return the width of
388 longest substring that doesn't exceed PRECISION, and set number of
389 characters and bytes of the substring in *NCHARS and *NBYTES
390 respectively. */
391
392 int
393 c_string_width (str, len, precision, nchars, nbytes)
394 const unsigned char *str;
395 int precision, *nchars, *nbytes;
396 {
397 int i = 0, i_byte = 0;
398 int width = 0;
399 struct Lisp_Char_Table *dp = buffer_display_table ();
400
401 while (i_byte < len)
402 {
403 int bytes, thiswidth;
404 Lisp_Object val;
405 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
406
407 if (dp)
408 {
409 val = DISP_CHAR_VECTOR (dp, c);
410 if (VECTORP (val))
411 thiswidth = XVECTOR (val)->size;
412 else
413 thiswidth = CHAR_WIDTH (c);
414 }
415 else
416 {
417 thiswidth = CHAR_WIDTH (c);
418 }
419
420 if (precision > 0
421 && (width + thiswidth > precision))
422 {
423 *nchars = i;
424 *nbytes = i_byte;
425 return width;
426 }
427 i++;
428 i_byte += bytes;
429 width += thiswidth;
430 }
431
432 if (precision > 0)
433 {
434 *nchars = i;
435 *nbytes = i_byte;
436 }
437
438 return width;
439 }
440
441 /* Return width of string STR of length LEN when displayed in the
442 current buffer. The width is measured by how many columns it
443 occupies on the screen. */
444
445 int
446 strwidth (str, len)
447 unsigned char *str;
448 int len;
449 {
450 return c_string_width (str, len, -1, NULL, NULL);
451 }
452
453 /* Return width of Lisp string STRING when displayed in the current
454 buffer. The width is measured by how many columns it occupies on
455 the screen while paying attention to compositions. If PRECISION >
456 0, return the width of longest substring that doesn't exceed
457 PRECISION, and set number of characters and bytes of the substring
458 in *NCHARS and *NBYTES respectively. */
459
460 int
461 lisp_string_width (string, precision, nchars, nbytes)
462 Lisp_Object string;
463 int precision, *nchars, *nbytes;
464 {
465 int len = SCHARS (string);
466 /* This set multibyte to 0 even if STRING is multibyte when it
467 contains only ascii and eight-bit-graphic, but that's
468 intentional. */
469 int multibyte = len < SBYTES (string);
470 unsigned char *str = SDATA (string);
471 int i = 0, i_byte = 0;
472 int width = 0;
473 struct Lisp_Char_Table *dp = buffer_display_table ();
474
475 while (i < len)
476 {
477 int chars, bytes, thiswidth;
478 Lisp_Object val;
479 int cmp_id;
480 EMACS_INT ignore, end;
481
482 if (find_composition (i, -1, &ignore, &end, &val, string)
483 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
484 >= 0))
485 {
486 thiswidth = composition_table[cmp_id]->width;
487 chars = end - i;
488 bytes = string_char_to_byte (string, end) - i_byte;
489 }
490 else
491 {
492 int c;
493
494 if (multibyte)
495 c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
496 else
497 c = str[i_byte], bytes = 1;
498 chars = 1;
499 if (dp)
500 {
501 val = DISP_CHAR_VECTOR (dp, c);
502 if (VECTORP (val))
503 thiswidth = XVECTOR (val)->size;
504 else
505 thiswidth = CHAR_WIDTH (c);
506 }
507 else
508 {
509 thiswidth = CHAR_WIDTH (c);
510 }
511 }
512
513 if (precision > 0
514 && (width + thiswidth > precision))
515 {
516 *nchars = i;
517 *nbytes = i_byte;
518 return width;
519 }
520 i += chars;
521 i_byte += bytes;
522 width += thiswidth;
523 }
524
525 if (precision > 0)
526 {
527 *nchars = i;
528 *nbytes = i_byte;
529 }
530
531 return width;
532 }
533
534 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
535 doc: /* Return width of STRING when displayed in the current buffer.
536 Width is measured by how many columns it occupies on the screen.
537 When calculating width of a multibyte character in STRING,
538 only the base leading-code is considered; the validity of
539 the following bytes is not checked. Tabs in STRING are always
540 taken to occupy `tab-width' columns. */)
541 (str)
542 Lisp_Object str;
543 {
544 Lisp_Object val;
545
546 CHECK_STRING (str);
547 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
548 return val;
549 }
550
551 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
552 doc: /* Return the direction of CHAR.
553 The returned value is 0 for left-to-right and 1 for right-to-left. */)
554 (ch)
555 Lisp_Object ch;
556 {
557 int c;
558
559 CHECK_CHARACTER (ch);
560 c = XINT (ch);
561 return CHAR_TABLE_REF (Vchar_direction_table, c);
562 }
563
564 /* Return the number of characters in the NBYTES bytes at PTR.
565 This works by looking at the contents and checking for multibyte
566 sequences while assuming that there's no invalid sequence.
567 However, if the current buffer has enable-multibyte-characters =
568 nil, we treat each byte as a character. */
569
570 int
571 chars_in_text (ptr, nbytes)
572 const unsigned char *ptr;
573 int nbytes;
574 {
575 /* current_buffer is null at early stages of Emacs initialization. */
576 if (current_buffer == 0
577 || NILP (current_buffer->enable_multibyte_characters))
578 return nbytes;
579
580 return multibyte_chars_in_text (ptr, nbytes);
581 }
582
583 /* Return the number of characters in the NBYTES bytes at PTR.
584 This works by looking at the contents and checking for multibyte
585 sequences while assuming that there's no invalid sequence. It
586 ignores enable-multibyte-characters. */
587
588 int
589 multibyte_chars_in_text (ptr, nbytes)
590 const unsigned char *ptr;
591 int nbytes;
592 {
593 const unsigned char *endp = ptr + nbytes;
594 int chars = 0;
595
596 while (ptr < endp)
597 {
598 int len = MULTIBYTE_LENGTH (ptr, endp);
599
600 if (len == 0)
601 abort ();
602 ptr += len;
603 chars++;
604 }
605
606 return chars;
607 }
608
609 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
610 characters and bytes in it, and store them in *NCHARS and *NBYTES
611 respectively. On counting bytes, pay attention to that 8-bit
612 characters not constructing a valid multibyte sequence are
613 represented by 2-byte in a multibyte text. */
614
615 void
616 parse_str_as_multibyte (str, len, nchars, nbytes)
617 const unsigned char *str;
618 int len, *nchars, *nbytes;
619 {
620 const unsigned char *endp = str + len;
621 int n, chars = 0, bytes = 0;
622
623 if (len >= MAX_MULTIBYTE_LENGTH)
624 {
625 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
626 while (str < adjusted_endp)
627 {
628 if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
629 str += n, bytes += n;
630 else
631 str++, bytes += 2;
632 chars++;
633 }
634 }
635 while (str < endp)
636 {
637 if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
638 str += n, bytes += n;
639 else
640 str++, bytes += 2;
641 chars++;
642 }
643
644 *nchars = chars;
645 *nbytes = bytes;
646 return;
647 }
648
649 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
650 It actually converts only such 8-bit characters that don't contruct
651 a multibyte sequence to multibyte forms of Latin-1 characters. If
652 NCHARS is nonzero, set *NCHARS to the number of characters in the
653 text. It is assured that we can use LEN bytes at STR as a work
654 area and that is enough. Return the number of bytes of the
655 resulting text. */
656
657 int
658 str_as_multibyte (str, len, nbytes, nchars)
659 unsigned char *str;
660 int len, nbytes, *nchars;
661 {
662 unsigned char *p = str, *endp = str + nbytes;
663 unsigned char *to;
664 int chars = 0;
665 int n;
666
667 if (nbytes >= MAX_MULTIBYTE_LENGTH)
668 {
669 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
670 while (p < adjusted_endp
671 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
672 p += n, chars++;
673 }
674 while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
675 p += n, chars++;
676 if (nchars)
677 *nchars = chars;
678 if (p == endp)
679 return nbytes;
680
681 to = p;
682 nbytes = endp - p;
683 endp = str + len;
684 safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
685 p = endp - nbytes;
686
687 if (nbytes >= MAX_MULTIBYTE_LENGTH)
688 {
689 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
690 while (p < adjusted_endp)
691 {
692 if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
693 {
694 while (n--)
695 *to++ = *p++;
696 }
697 else
698 {
699 int c = *p++;
700 c = BYTE8_TO_CHAR (c);
701 to += CHAR_STRING (c, to);
702 }
703 }
704 chars++;
705 }
706 while (p < endp)
707 {
708 if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
709 {
710 while (n--)
711 *to++ = *p++;
712 }
713 else
714 {
715 int c = *p++;
716 c = BYTE8_TO_CHAR (c);
717 to += CHAR_STRING (c, to);
718 }
719 chars++;
720 }
721 if (nchars)
722 *nchars = chars;
723 return (to - str);
724 }
725
726 /* Parse unibyte string at STR of LEN bytes, and return the number of
727 bytes it may ocupy when converted to multibyte string by
728 `str_to_multibyte'. */
729
730 int
731 parse_str_to_multibyte (str, len)
732 unsigned char *str;
733 int len;
734 {
735 unsigned char *endp = str + len;
736 int bytes;
737
738 for (bytes = 0; str < endp; str++)
739 bytes += (*str < 0x80) ? 1 : 2;
740 return bytes;
741 }
742
743
744 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
745 that contains the same single-byte characters. It actually
746 converts all 8-bit characters to multibyte forms. It is assured
747 that we can use LEN bytes at STR as a work area and that is
748 enough. */
749
750 int
751 str_to_multibyte (str, len, bytes)
752 unsigned char *str;
753 int len, bytes;
754 {
755 unsigned char *p = str, *endp = str + bytes;
756 unsigned char *to;
757
758 while (p < endp && *p < 0x80) p++;
759 if (p == endp)
760 return bytes;
761 to = p;
762 bytes = endp - p;
763 endp = str + len;
764 safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
765 p = endp - bytes;
766 while (p < endp)
767 {
768 int c = *p++;
769
770 if (c >= 0x80)
771 c = BYTE8_TO_CHAR (c);
772 to += CHAR_STRING (c, to);
773 }
774 return (to - str);
775 }
776
777 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
778 actually converts characters in the range 0x80..0xFF to
779 unibyte. */
780
781 int
782 str_as_unibyte (str, bytes)
783 unsigned char *str;
784 int bytes;
785 {
786 const unsigned char *p = str, *endp = str + bytes;
787 unsigned char *to;
788 int c, len;
789
790 while (p < endp)
791 {
792 c = *p;
793 len = BYTES_BY_CHAR_HEAD (c);
794 if (CHAR_BYTE8_HEAD_P (c))
795 break;
796 p += len;
797 }
798 to = str + (p - str);
799 while (p < endp)
800 {
801 c = *p;
802 len = BYTES_BY_CHAR_HEAD (c);
803 if (CHAR_BYTE8_HEAD_P (c))
804 {
805 c = STRING_CHAR_ADVANCE (p);
806 *to++ = CHAR_TO_BYTE8 (c);
807 }
808 else
809 {
810 while (len--) *to++ = *p++;
811 }
812 }
813 return (to - str);
814 }
815
816 int
817 string_count_byte8 (string)
818 Lisp_Object string;
819 {
820 int multibyte = STRING_MULTIBYTE (string);
821 int nbytes = SBYTES (string);
822 unsigned char *p = SDATA (string);
823 unsigned char *pend = p + nbytes;
824 int count = 0;
825 int c, len;
826
827 if (multibyte)
828 while (p < pend)
829 {
830 c = *p;
831 len = BYTES_BY_CHAR_HEAD (c);
832
833 if (CHAR_BYTE8_HEAD_P (c))
834 count++;
835 p += len;
836 }
837 else
838 while (p < pend)
839 {
840 if (*p++ >= 0x80)
841 count++;
842 }
843 return count;
844 }
845
846
847 Lisp_Object
848 string_escape_byte8 (string)
849 Lisp_Object string;
850 {
851 int nchars = SCHARS (string);
852 int nbytes = SBYTES (string);
853 int multibyte = STRING_MULTIBYTE (string);
854 int byte8_count;
855 const unsigned char *src, *src_end;
856 unsigned char *dst;
857 Lisp_Object val;
858 int c, len;
859
860 if (multibyte && nchars == nbytes)
861 return string;
862
863 byte8_count = string_count_byte8 (string);
864
865 if (byte8_count == 0)
866 return string;
867
868 if (multibyte)
869 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
870 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
871 nbytes + byte8_count * 2);
872 else
873 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
874 val = make_uninit_string (nbytes + byte8_count * 3);
875
876 src = SDATA (string);
877 src_end = src + nbytes;
878 dst = SDATA (val);
879 if (multibyte)
880 while (src < src_end)
881 {
882 c = *src;
883 len = BYTES_BY_CHAR_HEAD (c);
884
885 if (CHAR_BYTE8_HEAD_P (c))
886 {
887 c = STRING_CHAR_ADVANCE (src);
888 c = CHAR_TO_BYTE8 (c);
889 sprintf ((char *) dst, "\\%03o", c);
890 dst += 4;
891 }
892 else
893 while (len--) *dst++ = *src++;
894 }
895 else
896 while (src < src_end)
897 {
898 c = *src++;
899 if (c >= 0x80)
900 {
901 sprintf ((char *) dst, "\\%03o", c);
902 dst += 4;
903 }
904 else
905 *dst++ = c;
906 }
907 return val;
908 }
909
910 \f
911 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
912 doc: /*
913 Concatenate all the argument characters and make the result a string.
914 usage: (string &rest CHARACTERS) */)
915 (n, args)
916 int n;
917 Lisp_Object *args;
918 {
919 int i;
920 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
921 unsigned char *p = buf;
922 int c;
923
924 for (i = 0; i < n; i++)
925 {
926 CHECK_CHARACTER (args[i]);
927 c = XINT (args[i]);
928 p += CHAR_STRING (c, p);
929 }
930
931 return make_string_from_bytes ((char *) buf, n, p - buf);
932 }
933
934 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
935 doc: /* Concatenate all the argument bytes and make the result a unibyte string.
936 usage: (unibyte-string &rest BYTES) */)
937 (n, args)
938 int n;
939 Lisp_Object *args;
940 {
941 int i;
942 unsigned char *buf = (unsigned char *) alloca (n);
943 unsigned char *p = buf;
944 unsigned c;
945
946 for (i = 0; i < n; i++)
947 {
948 CHECK_NATNUM (args[i]);
949 c = XFASTINT (args[i]);
950 if (c >= 256)
951 args_out_of_range_3 (args[i], make_number (0), make_number (255));
952 *p++ = c;
953 }
954
955 return make_string_from_bytes ((char *) buf, n, p - buf);
956 }
957
958 void
959 init_character_once ()
960 {
961 }
962
963 #ifdef emacs
964
965 void
966 syms_of_character ()
967 {
968 DEFSYM (Qcharacterp, "characterp");
969 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
970
971 staticpro (&Vchar_unify_table);
972 Vchar_unify_table = Qnil;
973
974 defsubr (&Smax_char);
975 defsubr (&Scharacterp);
976 defsubr (&Sunibyte_char_to_multibyte);
977 defsubr (&Smultibyte_char_to_unibyte);
978 defsubr (&Schar_bytes);
979 defsubr (&Schar_width);
980 defsubr (&Sstring_width);
981 defsubr (&Schar_direction);
982 defsubr (&Sstring);
983 defsubr (&Sunibyte_string);
984
985 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
986 doc: /*
987 Vector recording all translation tables ever defined.
988 Each element is a pair (SYMBOL . TABLE) relating the table to the
989 symbol naming it. The ID of a translation table is an index into this vector. */);
990 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
991
992 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
993 doc: /*
994 A char-table for characters which invoke auto-filling.
995 Such characters have value t in this table. */);
996 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
997 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
998 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
999
1000 DEFVAR_LISP ("char-width-table", &Vchar_width_table,
1001 doc: /*
1002 A char-table for width (columns) of each character. */);
1003 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
1004 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1005 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1006 make_number (4));
1007
1008 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
1009 doc: /* A char-table for direction of each character. */);
1010 Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
1011
1012 DEFVAR_LISP ("printable-chars", &Vprintable_chars,
1013 doc: /* A char-table for each printable character. */);
1014 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
1015 Fset_char_table_range (Vprintable_chars,
1016 Fcons (make_number (32), make_number (126)), Qt);
1017 Fset_char_table_range (Vprintable_chars,
1018 Fcons (make_number (160),
1019 make_number (MAX_5_BYTE_CHAR)), Qt);
1020
1021 DEFVAR_LISP ("char-script-table", &Vchar_script_table,
1022 doc: /* Char table of script symbols.
1023 It has one extra slot whose value is a list of script symbols. */);
1024
1025 /* Intern this now in case it isn't already done.
1026 Setting this variable twice is harmless.
1027 But don't staticpro it here--that is done in alloc.c. */
1028 Qchar_table_extra_slots = intern ("char-table-extra-slots");
1029 DEFSYM (Qchar_script_table, "char-script-table");
1030 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1031 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
1032
1033 DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
1034 doc: /* Alist of scripts vs the representative characters. */);
1035 Vscript_representative_chars = Qnil;
1036 }
1037
1038 #endif /* emacs */
1039
1040 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1041 (do not change this comment) */