(char_resolve_modifier_mask): New function.
[bpt/emacs.git] / src / character.c
1 /* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 Free Software Foundation, Inc.
6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008
7 National Institute of Advanced Industrial Science and Technology (AIST)
8 Registration Number H13PRO009
9
10 This file is part of GNU Emacs.
11
12 GNU Emacs is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation; either version 3, or (at your option)
15 any later version.
16
17 GNU Emacs is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License for more details.
21
22 You should have received a copy of the GNU General Public License
23 along with GNU Emacs; see the file COPYING. If not, write to the
24 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
25 Boston, MA 02110-1301, USA. */
26
27 /* At first, see the document in `character.h' to understand the code
28 in this file. */
29
30 #ifdef emacs
31 #include <config.h>
32 #endif
33
34 #include <stdio.h>
35
36 #ifdef emacs
37
38 #include <sys/types.h>
39 #include "lisp.h"
40 #include "character.h"
41 #include "buffer.h"
42 #include "charset.h"
43 #include "composite.h"
44 #include "disptab.h"
45
46 #else /* not emacs */
47
48 #include "mulelib.h"
49
50 #endif /* emacs */
51
52 Lisp_Object Qcharacterp;
53
54 /* Vector of translation table ever defined.
55 ID of a translation table is used to index this vector. */
56 Lisp_Object Vtranslation_table_vector;
57
58 /* A char-table for characters which may invoke auto-filling. */
59 Lisp_Object Vauto_fill_chars;
60
61 Lisp_Object Qauto_fill_chars;
62
63 /* Char-table of information about which character to unify to which
64 Unicode character. */
65 Lisp_Object Vchar_unify_table;
66
67 /* A char-table. An element is non-nil iff the corresponding
68 character has a printable glyph. */
69 Lisp_Object Vprintable_chars;
70
71 /* A char-table. An elemnent is a column-width of the corresponding
72 character. */
73 Lisp_Object Vchar_width_table;
74
75 /* A char-table. An element is a symbol indicating the direction
76 property of corresponding character. */
77 Lisp_Object Vchar_direction_table;
78
79 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
80 unsigned char *_fetch_multibyte_char_p;
81
82 /* Char table of scripts. */
83 Lisp_Object Vchar_script_table;
84
85 /* Alist of scripts vs representative characters. */
86 Lisp_Object Vscript_representative_chars;
87
88 static Lisp_Object Qchar_script_table;
89
90 /* Mapping table from unibyte chars to multibyte chars. */
91 int unibyte_to_multibyte_table[256];
92
93 /* Nth element is 1 iff unibyte char N can be mapped to a multibyte
94 char. */
95 char unibyte_has_multibyte_table[256];
96
97 \f
98
99 /* If character code C has modifier masks, reflect them to the
100 character code if possible. Return the resulting code. */
101
102 int
103 char_resolve_modifier_mask (c)
104 int c;
105 {
106 /* An non-ASCII character can't reflect modifier bits to the code. */
107 if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
108 return c;
109
110 /* For Meta, Shift, and Control modifiers, we need special care. */
111 if (c & CHAR_META)
112 {
113 /* Move the meta bit to the right place for a string. */
114 c = (c & ~CHAR_META) | 0x80;
115 }
116 if (c & CHAR_SHIFT)
117 {
118 /* Shift modifier is valid only with [A-Za-z]. */
119 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
120 c &= ~CHAR_SHIFT;
121 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
122 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
123 }
124 if (c & CHAR_CTL)
125 {
126 /* Simulate the code in lread.c. */
127 /* Allow `\C- ' and `\C-?'. */
128 if ((c & ~CHAR_CTL) == ' ')
129 c = 0;
130 else if ((c & ~CHAR_CTL) == '?')
131 c = 127;
132 /* ASCII control chars are made from letters (both cases),
133 as well as the non-letters within 0100...0137. */
134 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
135 c &= (037 | (~0177 & ~CHAR_CTL));
136 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
137 c &= (037 | (~0177 & ~CHAR_CTL));
138 }
139
140 return c;
141 }
142
143
144 /* Store multibyte form of character C at P. If C has modifier bits,
145 handle them appropriately. */
146
147 int
148 char_string (c, p)
149 unsigned c;
150 unsigned char *p;
151 {
152 int bytes;
153
154 if (c & CHAR_MODIFIER_MASK)
155 {
156 c = (unsigned) char_resolve_modifier_mask ((int) c);
157 /* If C still has any modifier bits, just ignore it. */
158 c &= ~CHAR_MODIFIER_MASK;
159 }
160
161 MAYBE_UNIFY_CHAR (c);
162
163 if (c <= MAX_3_BYTE_CHAR)
164 {
165 bytes = CHAR_STRING (c, p);
166 }
167 else if (c <= MAX_4_BYTE_CHAR)
168 {
169 p[0] = (0xF0 | (c >> 18));
170 p[1] = (0x80 | ((c >> 12) & 0x3F));
171 p[2] = (0x80 | ((c >> 6) & 0x3F));
172 p[3] = (0x80 | (c & 0x3F));
173 bytes = 4;
174 }
175 else if (c <= MAX_5_BYTE_CHAR)
176 {
177 p[0] = 0xF8;
178 p[1] = (0x80 | ((c >> 18) & 0x0F));
179 p[2] = (0x80 | ((c >> 12) & 0x3F));
180 p[3] = (0x80 | ((c >> 6) & 0x3F));
181 p[4] = (0x80 | (c & 0x3F));
182 bytes = 5;
183 }
184 else if (c <= MAX_CHAR)
185 {
186 c = CHAR_TO_BYTE8 (c);
187 bytes = BYTE8_STRING (c, p);
188 }
189 else
190 error ("Invalid character: %d", c);
191
192 return bytes;
193 }
194
195
196 /* Return a character whose multibyte form is at P. Set LEN is not
197 NULL, it must be a pointer to integer. In that case, set *LEN to
198 the byte length of the multibyte form. If ADVANCED is not NULL, is
199 must be a pointer to unsigned char. In that case, set *ADVANCED to
200 the ending address (i.e. the starting address of the next
201 character) of the multibyte form. */
202
203 int
204 string_char (p, advanced, len)
205 const unsigned char *p;
206 const unsigned char **advanced;
207 int *len;
208 {
209 int c;
210 const unsigned char *saved_p = p;
211
212 if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
213 {
214 c = STRING_CHAR_ADVANCE (p);
215 }
216 else if (! (*p & 0x08))
217 {
218 c = ((((p)[0] & 0xF) << 18)
219 | (((p)[1] & 0x3F) << 12)
220 | (((p)[2] & 0x3F) << 6)
221 | ((p)[3] & 0x3F));
222 p += 4;
223 }
224 else
225 {
226 c = ((((p)[1] & 0x3F) << 18)
227 | (((p)[2] & 0x3F) << 12)
228 | (((p)[3] & 0x3F) << 6)
229 | ((p)[4] & 0x3F));
230 p += 5;
231 }
232
233 MAYBE_UNIFY_CHAR (c);
234
235 if (len)
236 *len = p - saved_p;
237 if (advanced)
238 *advanced = p;
239 return c;
240 }
241
242
243 /* Translate character C by translation table TABLE. If C is
244 negative, translate a character specified by CHARSET and CODE. If
245 no translation is found in TABLE, return the untranslated
246 character. If TABLE is a list, elements are char tables. In this
247 case, translace C by all tables. */
248
249 int
250 translate_char (table, c)
251 Lisp_Object table;
252 int c;
253 {
254 if (CHAR_TABLE_P (table))
255 {
256 Lisp_Object ch;
257
258 ch = CHAR_TABLE_REF (table, c);
259 if (CHARACTERP (ch))
260 c = XINT (ch);
261 }
262 else
263 {
264 for (; CONSP (table); table = XCDR (table))
265 c = translate_char (XCAR (table), c);
266 }
267 return c;
268 }
269
270 /* Convert the multibyte character C to unibyte 8-bit character based
271 on the current value of charset_unibyte. If dimension of
272 charset_unibyte is more than one, return (C & 0xFF).
273
274 The argument REV_TBL is now ignored. It will be removed in the
275 future. */
276
277 int
278 multibyte_char_to_unibyte (c, rev_tbl)
279 int c;
280 Lisp_Object rev_tbl;
281 {
282 struct charset *charset;
283 unsigned c1;
284
285 if (CHAR_BYTE8_P (c))
286 return CHAR_TO_BYTE8 (c);
287 charset = CHARSET_FROM_ID (charset_unibyte);
288 c1 = ENCODE_CHAR (charset, c);
289 return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
290 }
291
292 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
293 by charset_unibyte. */
294
295 int
296 multibyte_char_to_unibyte_safe (c)
297 int c;
298 {
299 struct charset *charset;
300 unsigned c1;
301
302 if (CHAR_BYTE8_P (c))
303 return CHAR_TO_BYTE8 (c);
304 charset = CHARSET_FROM_ID (charset_unibyte);
305 c1 = ENCODE_CHAR (charset, c);
306 return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : -1);
307 }
308
309 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
310 doc: /* Return non-nil if OBJECT is a character. */)
311 (object, ignore)
312 Lisp_Object object, ignore;
313 {
314 return (CHARACTERP (object) ? Qt : Qnil);
315 }
316
317 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
318 doc: /* Return the character of the maximum code. */)
319 ()
320 {
321 return make_number (MAX_CHAR);
322 }
323
324 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
325 Sunibyte_char_to_multibyte, 1, 1, 0,
326 doc: /* Convert the unibyte character CH to multibyte character.
327 The multibyte character is a result of decoding CH by
328 the current unibyte charset (see `unibyte-charset'). */)
329 (ch)
330 Lisp_Object ch;
331 {
332 int c;
333 struct charset *charset;
334
335 CHECK_CHARACTER (ch);
336 c = XFASTINT (ch);
337 if (c >= 0400)
338 error ("Invalid unibyte character: %d", c);
339 charset = CHARSET_FROM_ID (charset_unibyte);
340 c = DECODE_CHAR (charset, c);
341 if (c < 0)
342 c = BYTE8_TO_CHAR (XFASTINT (ch));
343 return make_number (c);
344 }
345
346 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
347 Smultibyte_char_to_unibyte, 1, 1, 0,
348 doc: /* Convert the multibyte character CH to unibyte character.\n\
349 The unibyte character is a result of encoding CH by
350 the current primary charset (value of `charset-primary'). */)
351 (ch)
352 Lisp_Object ch;
353 {
354 int c;
355
356 CHECK_CHARACTER (ch);
357 c = XFASTINT (ch);
358 c = CHAR_TO_BYTE8 (c);
359 return make_number (c);
360 }
361
362 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
363 doc: /* Return 1 regardless of the argument CHAR.
364 This is now an obsolete function. We keep it just for backward compatibility. */)
365 (ch)
366 Lisp_Object ch;
367 {
368 CHECK_CHARACTER (ch);
369 return make_number (1);
370 }
371
372 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
373 doc: /* Return width of CHAR when displayed in the current buffer.
374 The width is measured by how many columns it occupies on the screen.
375 Tab is taken to occupy `tab-width' columns. */)
376 (ch)
377 Lisp_Object ch;
378 {
379 Lisp_Object disp;
380 int c, width;
381 struct Lisp_Char_Table *dp = buffer_display_table ();
382
383 CHECK_CHARACTER (ch);
384 c = XINT (ch);
385
386 /* Get the way the display table would display it. */
387 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
388
389 if (VECTORP (disp))
390 width = ASIZE (disp);
391 else
392 width = CHAR_WIDTH (c);
393
394 return make_number (width);
395 }
396
397 /* Return width of string STR of length LEN when displayed in the
398 current buffer. The width is measured by how many columns it
399 occupies on the screen. If PRECISION > 0, return the width of
400 longest substring that doesn't exceed PRECISION, and set number of
401 characters and bytes of the substring in *NCHARS and *NBYTES
402 respectively. */
403
404 int
405 c_string_width (str, len, precision, nchars, nbytes)
406 const unsigned char *str;
407 int precision, *nchars, *nbytes;
408 {
409 int i = 0, i_byte = 0;
410 int width = 0;
411 struct Lisp_Char_Table *dp = buffer_display_table ();
412
413 while (i_byte < len)
414 {
415 int bytes, thiswidth;
416 Lisp_Object val;
417 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
418
419 if (dp)
420 {
421 val = DISP_CHAR_VECTOR (dp, c);
422 if (VECTORP (val))
423 thiswidth = XVECTOR (val)->size;
424 else
425 thiswidth = CHAR_WIDTH (c);
426 }
427 else
428 {
429 thiswidth = CHAR_WIDTH (c);
430 }
431
432 if (precision > 0
433 && (width + thiswidth > precision))
434 {
435 *nchars = i;
436 *nbytes = i_byte;
437 return width;
438 }
439 i++;
440 i_byte += bytes;
441 width += thiswidth;
442 }
443
444 if (precision > 0)
445 {
446 *nchars = i;
447 *nbytes = i_byte;
448 }
449
450 return width;
451 }
452
453 /* Return width of string STR of length LEN when displayed in the
454 current buffer. The width is measured by how many columns it
455 occupies on the screen. */
456
457 int
458 strwidth (str, len)
459 unsigned char *str;
460 int len;
461 {
462 return c_string_width (str, len, -1, NULL, NULL);
463 }
464
465 /* Return width of Lisp string STRING when displayed in the current
466 buffer. The width is measured by how many columns it occupies on
467 the screen while paying attention to compositions. If PRECISION >
468 0, return the width of longest substring that doesn't exceed
469 PRECISION, and set number of characters and bytes of the substring
470 in *NCHARS and *NBYTES respectively. */
471
472 int
473 lisp_string_width (string, precision, nchars, nbytes)
474 Lisp_Object string;
475 int precision, *nchars, *nbytes;
476 {
477 int len = SCHARS (string);
478 /* This set multibyte to 0 even if STRING is multibyte when it
479 contains only ascii and eight-bit-graphic, but that's
480 intentional. */
481 int multibyte = len < SBYTES (string);
482 unsigned char *str = SDATA (string);
483 int i = 0, i_byte = 0;
484 int width = 0;
485 struct Lisp_Char_Table *dp = buffer_display_table ();
486
487 while (i < len)
488 {
489 int chars, bytes, thiswidth;
490 Lisp_Object val;
491 int cmp_id;
492 EMACS_INT ignore, end;
493
494 if (find_composition (i, -1, &ignore, &end, &val, string)
495 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
496 >= 0))
497 {
498 thiswidth = composition_table[cmp_id]->width;
499 chars = end - i;
500 bytes = string_char_to_byte (string, end) - i_byte;
501 }
502 else
503 {
504 int c;
505
506 if (multibyte)
507 c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
508 else
509 c = str[i_byte], bytes = 1;
510 chars = 1;
511 if (dp)
512 {
513 val = DISP_CHAR_VECTOR (dp, c);
514 if (VECTORP (val))
515 thiswidth = XVECTOR (val)->size;
516 else
517 thiswidth = CHAR_WIDTH (c);
518 }
519 else
520 {
521 thiswidth = CHAR_WIDTH (c);
522 }
523 }
524
525 if (precision > 0
526 && (width + thiswidth > precision))
527 {
528 *nchars = i;
529 *nbytes = i_byte;
530 return width;
531 }
532 i += chars;
533 i_byte += bytes;
534 width += thiswidth;
535 }
536
537 if (precision > 0)
538 {
539 *nchars = i;
540 *nbytes = i_byte;
541 }
542
543 return width;
544 }
545
546 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
547 doc: /* Return width of STRING when displayed in the current buffer.
548 Width is measured by how many columns it occupies on the screen.
549 When calculating width of a multibyte character in STRING,
550 only the base leading-code is considered; the validity of
551 the following bytes is not checked. Tabs in STRING are always
552 taken to occupy `tab-width' columns. */)
553 (str)
554 Lisp_Object str;
555 {
556 Lisp_Object val;
557
558 CHECK_STRING (str);
559 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
560 return val;
561 }
562
563 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
564 doc: /* Return the direction of CHAR.
565 The returned value is 0 for left-to-right and 1 for right-to-left. */)
566 (ch)
567 Lisp_Object ch;
568 {
569 int c;
570
571 CHECK_CHARACTER (ch);
572 c = XINT (ch);
573 return CHAR_TABLE_REF (Vchar_direction_table, c);
574 }
575
576 /* Return the number of characters in the NBYTES bytes at PTR.
577 This works by looking at the contents and checking for multibyte
578 sequences while assuming that there's no invalid sequence.
579 However, if the current buffer has enable-multibyte-characters =
580 nil, we treat each byte as a character. */
581
582 EMACS_INT
583 chars_in_text (ptr, nbytes)
584 const unsigned char *ptr;
585 EMACS_INT nbytes;
586 {
587 /* current_buffer is null at early stages of Emacs initialization. */
588 if (current_buffer == 0
589 || NILP (current_buffer->enable_multibyte_characters))
590 return nbytes;
591
592 return multibyte_chars_in_text (ptr, nbytes);
593 }
594
595 /* Return the number of characters in the NBYTES bytes at PTR.
596 This works by looking at the contents and checking for multibyte
597 sequences while assuming that there's no invalid sequence. It
598 ignores enable-multibyte-characters. */
599
600 EMACS_INT
601 multibyte_chars_in_text (ptr, nbytes)
602 const unsigned char *ptr;
603 EMACS_INT nbytes;
604 {
605 const unsigned char *endp = ptr + nbytes;
606 int chars = 0;
607
608 while (ptr < endp)
609 {
610 int len = MULTIBYTE_LENGTH (ptr, endp);
611
612 if (len == 0)
613 abort ();
614 ptr += len;
615 chars++;
616 }
617
618 return chars;
619 }
620
621 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
622 characters and bytes in it, and store them in *NCHARS and *NBYTES
623 respectively. On counting bytes, pay attention to that 8-bit
624 characters not constructing a valid multibyte sequence are
625 represented by 2-byte in a multibyte text. */
626
627 void
628 parse_str_as_multibyte (str, len, nchars, nbytes)
629 const unsigned char *str;
630 int len, *nchars, *nbytes;
631 {
632 const unsigned char *endp = str + len;
633 int n, chars = 0, bytes = 0;
634
635 if (len >= MAX_MULTIBYTE_LENGTH)
636 {
637 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
638 while (str < adjusted_endp)
639 {
640 if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
641 str += n, bytes += n;
642 else
643 str++, bytes += 2;
644 chars++;
645 }
646 }
647 while (str < endp)
648 {
649 if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
650 str += n, bytes += n;
651 else
652 str++, bytes += 2;
653 chars++;
654 }
655
656 *nchars = chars;
657 *nbytes = bytes;
658 return;
659 }
660
661 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
662 It actually converts only such 8-bit characters that don't contruct
663 a multibyte sequence to multibyte forms of Latin-1 characters. If
664 NCHARS is nonzero, set *NCHARS to the number of characters in the
665 text. It is assured that we can use LEN bytes at STR as a work
666 area and that is enough. Return the number of bytes of the
667 resulting text. */
668
669 int
670 str_as_multibyte (str, len, nbytes, nchars)
671 unsigned char *str;
672 int len, nbytes, *nchars;
673 {
674 unsigned char *p = str, *endp = str + nbytes;
675 unsigned char *to;
676 int chars = 0;
677 int n;
678
679 if (nbytes >= MAX_MULTIBYTE_LENGTH)
680 {
681 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
682 while (p < adjusted_endp
683 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
684 p += n, chars++;
685 }
686 while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
687 p += n, chars++;
688 if (nchars)
689 *nchars = chars;
690 if (p == endp)
691 return nbytes;
692
693 to = p;
694 nbytes = endp - p;
695 endp = str + len;
696 safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
697 p = endp - nbytes;
698
699 if (nbytes >= MAX_MULTIBYTE_LENGTH)
700 {
701 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
702 while (p < adjusted_endp)
703 {
704 if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
705 {
706 while (n--)
707 *to++ = *p++;
708 }
709 else
710 {
711 int c = *p++;
712 c = BYTE8_TO_CHAR (c);
713 to += CHAR_STRING (c, to);
714 }
715 }
716 chars++;
717 }
718 while (p < endp)
719 {
720 if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
721 {
722 while (n--)
723 *to++ = *p++;
724 }
725 else
726 {
727 int c = *p++;
728 c = BYTE8_TO_CHAR (c);
729 to += CHAR_STRING (c, to);
730 }
731 chars++;
732 }
733 if (nchars)
734 *nchars = chars;
735 return (to - str);
736 }
737
738 /* Parse unibyte string at STR of LEN bytes, and return the number of
739 bytes it may ocupy when converted to multibyte string by
740 `str_to_multibyte'. */
741
742 int
743 parse_str_to_multibyte (str, len)
744 unsigned char *str;
745 int len;
746 {
747 unsigned char *endp = str + len;
748 int bytes;
749
750 for (bytes = 0; str < endp; str++)
751 bytes += (*str < 0x80) ? 1 : 2;
752 return bytes;
753 }
754
755
756 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
757 that contains the same single-byte characters. It actually
758 converts all 8-bit characters to multibyte forms. It is assured
759 that we can use LEN bytes at STR as a work area and that is
760 enough. */
761
762 int
763 str_to_multibyte (str, len, bytes)
764 unsigned char *str;
765 int len, bytes;
766 {
767 unsigned char *p = str, *endp = str + bytes;
768 unsigned char *to;
769
770 while (p < endp && *p < 0x80) p++;
771 if (p == endp)
772 return bytes;
773 to = p;
774 bytes = endp - p;
775 endp = str + len;
776 safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
777 p = endp - bytes;
778 while (p < endp)
779 {
780 int c = *p++;
781
782 if (c >= 0x80)
783 c = BYTE8_TO_CHAR (c);
784 to += CHAR_STRING (c, to);
785 }
786 return (to - str);
787 }
788
789 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
790 actually converts characters in the range 0x80..0xFF to
791 unibyte. */
792
793 int
794 str_as_unibyte (str, bytes)
795 unsigned char *str;
796 int bytes;
797 {
798 const unsigned char *p = str, *endp = str + bytes;
799 unsigned char *to;
800 int c, len;
801
802 while (p < endp)
803 {
804 c = *p;
805 len = BYTES_BY_CHAR_HEAD (c);
806 if (CHAR_BYTE8_HEAD_P (c))
807 break;
808 p += len;
809 }
810 to = str + (p - str);
811 while (p < endp)
812 {
813 c = *p;
814 len = BYTES_BY_CHAR_HEAD (c);
815 if (CHAR_BYTE8_HEAD_P (c))
816 {
817 c = STRING_CHAR_ADVANCE (p);
818 *to++ = CHAR_TO_BYTE8 (c);
819 }
820 else
821 {
822 while (len--) *to++ = *p++;
823 }
824 }
825 return (to - str);
826 }
827
828 int
829 string_count_byte8 (string)
830 Lisp_Object string;
831 {
832 int multibyte = STRING_MULTIBYTE (string);
833 int nbytes = SBYTES (string);
834 unsigned char *p = SDATA (string);
835 unsigned char *pend = p + nbytes;
836 int count = 0;
837 int c, len;
838
839 if (multibyte)
840 while (p < pend)
841 {
842 c = *p;
843 len = BYTES_BY_CHAR_HEAD (c);
844
845 if (CHAR_BYTE8_HEAD_P (c))
846 count++;
847 p += len;
848 }
849 else
850 while (p < pend)
851 {
852 if (*p++ >= 0x80)
853 count++;
854 }
855 return count;
856 }
857
858
859 Lisp_Object
860 string_escape_byte8 (string)
861 Lisp_Object string;
862 {
863 int nchars = SCHARS (string);
864 int nbytes = SBYTES (string);
865 int multibyte = STRING_MULTIBYTE (string);
866 int byte8_count;
867 const unsigned char *src, *src_end;
868 unsigned char *dst;
869 Lisp_Object val;
870 int c, len;
871
872 if (multibyte && nchars == nbytes)
873 return string;
874
875 byte8_count = string_count_byte8 (string);
876
877 if (byte8_count == 0)
878 return string;
879
880 if (multibyte)
881 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
882 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
883 nbytes + byte8_count * 2);
884 else
885 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
886 val = make_uninit_string (nbytes + byte8_count * 3);
887
888 src = SDATA (string);
889 src_end = src + nbytes;
890 dst = SDATA (val);
891 if (multibyte)
892 while (src < src_end)
893 {
894 c = *src;
895 len = BYTES_BY_CHAR_HEAD (c);
896
897 if (CHAR_BYTE8_HEAD_P (c))
898 {
899 c = STRING_CHAR_ADVANCE (src);
900 c = CHAR_TO_BYTE8 (c);
901 sprintf ((char *) dst, "\\%03o", c);
902 dst += 4;
903 }
904 else
905 while (len--) *dst++ = *src++;
906 }
907 else
908 while (src < src_end)
909 {
910 c = *src++;
911 if (c >= 0x80)
912 {
913 sprintf ((char *) dst, "\\%03o", c);
914 dst += 4;
915 }
916 else
917 *dst++ = c;
918 }
919 return val;
920 }
921
922 \f
923 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
924 doc: /*
925 Concatenate all the argument characters and make the result a string.
926 usage: (string &rest CHARACTERS) */)
927 (n, args)
928 int n;
929 Lisp_Object *args;
930 {
931 int i;
932 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
933 unsigned char *p = buf;
934 int c;
935
936 for (i = 0; i < n; i++)
937 {
938 CHECK_CHARACTER (args[i]);
939 c = XINT (args[i]);
940 p += CHAR_STRING (c, p);
941 }
942
943 return make_string_from_bytes ((char *) buf, n, p - buf);
944 }
945
946 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
947 doc: /* Concatenate all the argument bytes and make the result a unibyte string.
948 usage: (unibyte-string &rest BYTES) */)
949 (n, args)
950 int n;
951 Lisp_Object *args;
952 {
953 int i;
954 unsigned char *buf = (unsigned char *) alloca (n);
955 unsigned char *p = buf;
956 unsigned c;
957
958 for (i = 0; i < n; i++)
959 {
960 CHECK_NATNUM (args[i]);
961 c = XFASTINT (args[i]);
962 if (c >= 256)
963 args_out_of_range_3 (args[i], make_number (0), make_number (255));
964 *p++ = c;
965 }
966
967 return make_string_from_bytes ((char *) buf, n, p - buf);
968 }
969
970 void
971 init_character_once ()
972 {
973 }
974
975 #ifdef emacs
976
977 void
978 syms_of_character ()
979 {
980 DEFSYM (Qcharacterp, "characterp");
981 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
982
983 staticpro (&Vchar_unify_table);
984 Vchar_unify_table = Qnil;
985
986 defsubr (&Smax_char);
987 defsubr (&Scharacterp);
988 defsubr (&Sunibyte_char_to_multibyte);
989 defsubr (&Smultibyte_char_to_unibyte);
990 defsubr (&Schar_bytes);
991 defsubr (&Schar_width);
992 defsubr (&Sstring_width);
993 defsubr (&Schar_direction);
994 defsubr (&Sstring);
995 defsubr (&Sunibyte_string);
996
997 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
998 doc: /*
999 Vector recording all translation tables ever defined.
1000 Each element is a pair (SYMBOL . TABLE) relating the table to the
1001 symbol naming it. The ID of a translation table is an index into this vector. */);
1002 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1003
1004 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1005 doc: /*
1006 A char-table for characters which invoke auto-filling.
1007 Such characters have value t in this table. */);
1008 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1009 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1010 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
1011
1012 DEFVAR_LISP ("char-width-table", &Vchar_width_table,
1013 doc: /*
1014 A char-table for width (columns) of each character. */);
1015 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
1016 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1017 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1018 make_number (4));
1019
1020 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
1021 doc: /* A char-table for direction of each character. */);
1022 Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
1023
1024 DEFVAR_LISP ("printable-chars", &Vprintable_chars,
1025 doc: /* A char-table for each printable character. */);
1026 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
1027 Fset_char_table_range (Vprintable_chars,
1028 Fcons (make_number (32), make_number (126)), Qt);
1029 Fset_char_table_range (Vprintable_chars,
1030 Fcons (make_number (160),
1031 make_number (MAX_5_BYTE_CHAR)), Qt);
1032
1033 DEFVAR_LISP ("char-script-table", &Vchar_script_table,
1034 doc: /* Char table of script symbols.
1035 It has one extra slot whose value is a list of script symbols. */);
1036
1037 /* Intern this now in case it isn't already done.
1038 Setting this variable twice is harmless.
1039 But don't staticpro it here--that is done in alloc.c. */
1040 Qchar_table_extra_slots = intern ("char-table-extra-slots");
1041 DEFSYM (Qchar_script_table, "char-script-table");
1042 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1043 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
1044
1045 DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
1046 doc: /* Alist of scripts vs the representative characters. */);
1047 Vscript_representative_chars = Qnil;
1048 }
1049
1050 #endif /* emacs */
1051
1052 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1053 (do not change this comment) */