(char_resolve_modifier_mask): Fix previous change
[bpt/emacs.git] / src / character.c
1 /* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
5 Free Software Foundation, Inc.
6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008
7 National Institute of Advanced Industrial Science and Technology (AIST)
8 Registration Number H13PRO009
9
10 This file is part of GNU Emacs.
11
12 GNU Emacs is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation; either version 3, or (at your option)
15 any later version.
16
17 GNU Emacs is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License for more details.
21
22 You should have received a copy of the GNU General Public License
23 along with GNU Emacs; see the file COPYING. If not, write to the
24 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
25 Boston, MA 02110-1301, USA. */
26
27 /* At first, see the document in `character.h' to understand the code
28 in this file. */
29
30 #ifdef emacs
31 #include <config.h>
32 #endif
33
34 #include <stdio.h>
35
36 #ifdef emacs
37
38 #include <sys/types.h>
39 #include "lisp.h"
40 #include "character.h"
41 #include "buffer.h"
42 #include "charset.h"
43 #include "composite.h"
44 #include "disptab.h"
45
46 #else /* not emacs */
47
48 #include "mulelib.h"
49
50 #endif /* emacs */
51
52 Lisp_Object Qcharacterp;
53
54 /* Vector of translation table ever defined.
55 ID of a translation table is used to index this vector. */
56 Lisp_Object Vtranslation_table_vector;
57
58 /* A char-table for characters which may invoke auto-filling. */
59 Lisp_Object Vauto_fill_chars;
60
61 Lisp_Object Qauto_fill_chars;
62
63 /* Char-table of information about which character to unify to which
64 Unicode character. */
65 Lisp_Object Vchar_unify_table;
66
67 /* A char-table. An element is non-nil iff the corresponding
68 character has a printable glyph. */
69 Lisp_Object Vprintable_chars;
70
71 /* A char-table. An elemnent is a column-width of the corresponding
72 character. */
73 Lisp_Object Vchar_width_table;
74
75 /* A char-table. An element is a symbol indicating the direction
76 property of corresponding character. */
77 Lisp_Object Vchar_direction_table;
78
79 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
80 unsigned char *_fetch_multibyte_char_p;
81
82 /* Char table of scripts. */
83 Lisp_Object Vchar_script_table;
84
85 /* Alist of scripts vs representative characters. */
86 Lisp_Object Vscript_representative_chars;
87
88 static Lisp_Object Qchar_script_table;
89
90 /* Mapping table from unibyte chars to multibyte chars. */
91 int unibyte_to_multibyte_table[256];
92
93 /* Nth element is 1 iff unibyte char N can be mapped to a multibyte
94 char. */
95 char unibyte_has_multibyte_table[256];
96
97 \f
98
99 /* If character code C has modifier masks, reflect them to the
100 character code if possible. Return the resulting code. */
101
102 int
103 char_resolve_modifier_mask (c)
104 int c;
105 {
106 /* A non-ASCII character can't reflect modifier bits to the code. */
107 if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
108 return c;
109
110 /* For Meta, Shift, and Control modifiers, we need special care. */
111 if (c & CHAR_SHIFT)
112 {
113 /* Shift modifier is valid only with [A-Za-z]. */
114 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
115 c &= ~CHAR_SHIFT;
116 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
117 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
118 /* Shift modifier with ASCII control characters should be
119 ignored. */
120 else if ((c & ~CHAR_MODIFIER_MASK) < 0x20)
121 c &= ~CHAR_SHIFT;
122 }
123 if (c & CHAR_META)
124 {
125 /* Move the meta bit to the right place for a string. */
126 c = (c & ~CHAR_META) | 0x80;
127 }
128 if (c & CHAR_CTL)
129 {
130 /* Simulate the code in lread.c. */
131 /* Allow `\C- ' and `\C-?'. */
132 if ((c & ~CHAR_CTL) == ' ')
133 c = 0;
134 else if ((c & ~CHAR_CTL) == '?')
135 c = 127;
136 /* ASCII control chars are made from letters (both cases),
137 as well as the non-letters within 0100...0137. */
138 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
139 c &= (037 | (~0177 & ~CHAR_CTL));
140 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
141 c &= (037 | (~0177 & ~CHAR_CTL));
142 }
143
144 return c;
145 }
146
147
148 /* Store multibyte form of character C at P. If C has modifier bits,
149 handle them appropriately. */
150
151 int
152 char_string (c, p)
153 unsigned c;
154 unsigned char *p;
155 {
156 int bytes;
157
158 if (c & CHAR_MODIFIER_MASK)
159 {
160 c = (unsigned) char_resolve_modifier_mask ((int) c);
161 /* If C still has any modifier bits, just ignore it. */
162 c &= ~CHAR_MODIFIER_MASK;
163 }
164
165 MAYBE_UNIFY_CHAR (c);
166
167 if (c <= MAX_3_BYTE_CHAR)
168 {
169 bytes = CHAR_STRING (c, p);
170 }
171 else if (c <= MAX_4_BYTE_CHAR)
172 {
173 p[0] = (0xF0 | (c >> 18));
174 p[1] = (0x80 | ((c >> 12) & 0x3F));
175 p[2] = (0x80 | ((c >> 6) & 0x3F));
176 p[3] = (0x80 | (c & 0x3F));
177 bytes = 4;
178 }
179 else if (c <= MAX_5_BYTE_CHAR)
180 {
181 p[0] = 0xF8;
182 p[1] = (0x80 | ((c >> 18) & 0x0F));
183 p[2] = (0x80 | ((c >> 12) & 0x3F));
184 p[3] = (0x80 | ((c >> 6) & 0x3F));
185 p[4] = (0x80 | (c & 0x3F));
186 bytes = 5;
187 }
188 else if (c <= MAX_CHAR)
189 {
190 c = CHAR_TO_BYTE8 (c);
191 bytes = BYTE8_STRING (c, p);
192 }
193 else
194 error ("Invalid character: %d", c);
195
196 return bytes;
197 }
198
199
200 /* Return a character whose multibyte form is at P. Set LEN is not
201 NULL, it must be a pointer to integer. In that case, set *LEN to
202 the byte length of the multibyte form. If ADVANCED is not NULL, is
203 must be a pointer to unsigned char. In that case, set *ADVANCED to
204 the ending address (i.e. the starting address of the next
205 character) of the multibyte form. */
206
207 int
208 string_char (p, advanced, len)
209 const unsigned char *p;
210 const unsigned char **advanced;
211 int *len;
212 {
213 int c;
214 const unsigned char *saved_p = p;
215
216 if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
217 {
218 c = STRING_CHAR_ADVANCE (p);
219 }
220 else if (! (*p & 0x08))
221 {
222 c = ((((p)[0] & 0xF) << 18)
223 | (((p)[1] & 0x3F) << 12)
224 | (((p)[2] & 0x3F) << 6)
225 | ((p)[3] & 0x3F));
226 p += 4;
227 }
228 else
229 {
230 c = ((((p)[1] & 0x3F) << 18)
231 | (((p)[2] & 0x3F) << 12)
232 | (((p)[3] & 0x3F) << 6)
233 | ((p)[4] & 0x3F));
234 p += 5;
235 }
236
237 MAYBE_UNIFY_CHAR (c);
238
239 if (len)
240 *len = p - saved_p;
241 if (advanced)
242 *advanced = p;
243 return c;
244 }
245
246
247 /* Translate character C by translation table TABLE. If C is
248 negative, translate a character specified by CHARSET and CODE. If
249 no translation is found in TABLE, return the untranslated
250 character. If TABLE is a list, elements are char tables. In this
251 case, translace C by all tables. */
252
253 int
254 translate_char (table, c)
255 Lisp_Object table;
256 int c;
257 {
258 if (CHAR_TABLE_P (table))
259 {
260 Lisp_Object ch;
261
262 ch = CHAR_TABLE_REF (table, c);
263 if (CHARACTERP (ch))
264 c = XINT (ch);
265 }
266 else
267 {
268 for (; CONSP (table); table = XCDR (table))
269 c = translate_char (XCAR (table), c);
270 }
271 return c;
272 }
273
274 /* Convert the multibyte character C to unibyte 8-bit character based
275 on the current value of charset_unibyte. If dimension of
276 charset_unibyte is more than one, return (C & 0xFF).
277
278 The argument REV_TBL is now ignored. It will be removed in the
279 future. */
280
281 int
282 multibyte_char_to_unibyte (c, rev_tbl)
283 int c;
284 Lisp_Object rev_tbl;
285 {
286 struct charset *charset;
287 unsigned c1;
288
289 if (CHAR_BYTE8_P (c))
290 return CHAR_TO_BYTE8 (c);
291 charset = CHARSET_FROM_ID (charset_unibyte);
292 c1 = ENCODE_CHAR (charset, c);
293 return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
294 }
295
296 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
297 by charset_unibyte. */
298
299 int
300 multibyte_char_to_unibyte_safe (c)
301 int c;
302 {
303 struct charset *charset;
304 unsigned c1;
305
306 if (CHAR_BYTE8_P (c))
307 return CHAR_TO_BYTE8 (c);
308 charset = CHARSET_FROM_ID (charset_unibyte);
309 c1 = ENCODE_CHAR (charset, c);
310 return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : -1);
311 }
312
313 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
314 doc: /* Return non-nil if OBJECT is a character. */)
315 (object, ignore)
316 Lisp_Object object, ignore;
317 {
318 return (CHARACTERP (object) ? Qt : Qnil);
319 }
320
321 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
322 doc: /* Return the character of the maximum code. */)
323 ()
324 {
325 return make_number (MAX_CHAR);
326 }
327
328 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
329 Sunibyte_char_to_multibyte, 1, 1, 0,
330 doc: /* Convert the unibyte character CH to multibyte character.
331 The multibyte character is a result of decoding CH by
332 the current unibyte charset (see `unibyte-charset'). */)
333 (ch)
334 Lisp_Object ch;
335 {
336 int c;
337 struct charset *charset;
338
339 CHECK_CHARACTER (ch);
340 c = XFASTINT (ch);
341 if (c >= 0400)
342 error ("Invalid unibyte character: %d", c);
343 charset = CHARSET_FROM_ID (charset_unibyte);
344 c = DECODE_CHAR (charset, c);
345 if (c < 0)
346 c = BYTE8_TO_CHAR (XFASTINT (ch));
347 return make_number (c);
348 }
349
350 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
351 Smultibyte_char_to_unibyte, 1, 1, 0,
352 doc: /* Convert the multibyte character CH to unibyte character.\n\
353 The unibyte character is a result of encoding CH by
354 the current primary charset (value of `charset-primary'). */)
355 (ch)
356 Lisp_Object ch;
357 {
358 int c;
359
360 CHECK_CHARACTER (ch);
361 c = XFASTINT (ch);
362 c = CHAR_TO_BYTE8 (c);
363 return make_number (c);
364 }
365
366 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
367 doc: /* Return 1 regardless of the argument CHAR.
368 This is now an obsolete function. We keep it just for backward compatibility. */)
369 (ch)
370 Lisp_Object ch;
371 {
372 CHECK_CHARACTER (ch);
373 return make_number (1);
374 }
375
376 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
377 doc: /* Return width of CHAR when displayed in the current buffer.
378 The width is measured by how many columns it occupies on the screen.
379 Tab is taken to occupy `tab-width' columns. */)
380 (ch)
381 Lisp_Object ch;
382 {
383 Lisp_Object disp;
384 int c, width;
385 struct Lisp_Char_Table *dp = buffer_display_table ();
386
387 CHECK_CHARACTER (ch);
388 c = XINT (ch);
389
390 /* Get the way the display table would display it. */
391 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
392
393 if (VECTORP (disp))
394 width = ASIZE (disp);
395 else
396 width = CHAR_WIDTH (c);
397
398 return make_number (width);
399 }
400
401 /* Return width of string STR of length LEN when displayed in the
402 current buffer. The width is measured by how many columns it
403 occupies on the screen. If PRECISION > 0, return the width of
404 longest substring that doesn't exceed PRECISION, and set number of
405 characters and bytes of the substring in *NCHARS and *NBYTES
406 respectively. */
407
408 int
409 c_string_width (str, len, precision, nchars, nbytes)
410 const unsigned char *str;
411 int precision, *nchars, *nbytes;
412 {
413 int i = 0, i_byte = 0;
414 int width = 0;
415 struct Lisp_Char_Table *dp = buffer_display_table ();
416
417 while (i_byte < len)
418 {
419 int bytes, thiswidth;
420 Lisp_Object val;
421 int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
422
423 if (dp)
424 {
425 val = DISP_CHAR_VECTOR (dp, c);
426 if (VECTORP (val))
427 thiswidth = XVECTOR (val)->size;
428 else
429 thiswidth = CHAR_WIDTH (c);
430 }
431 else
432 {
433 thiswidth = CHAR_WIDTH (c);
434 }
435
436 if (precision > 0
437 && (width + thiswidth > precision))
438 {
439 *nchars = i;
440 *nbytes = i_byte;
441 return width;
442 }
443 i++;
444 i_byte += bytes;
445 width += thiswidth;
446 }
447
448 if (precision > 0)
449 {
450 *nchars = i;
451 *nbytes = i_byte;
452 }
453
454 return width;
455 }
456
457 /* Return width of string STR of length LEN when displayed in the
458 current buffer. The width is measured by how many columns it
459 occupies on the screen. */
460
461 int
462 strwidth (str, len)
463 unsigned char *str;
464 int len;
465 {
466 return c_string_width (str, len, -1, NULL, NULL);
467 }
468
469 /* Return width of Lisp string STRING when displayed in the current
470 buffer. The width is measured by how many columns it occupies on
471 the screen while paying attention to compositions. If PRECISION >
472 0, return the width of longest substring that doesn't exceed
473 PRECISION, and set number of characters and bytes of the substring
474 in *NCHARS and *NBYTES respectively. */
475
476 int
477 lisp_string_width (string, precision, nchars, nbytes)
478 Lisp_Object string;
479 int precision, *nchars, *nbytes;
480 {
481 int len = SCHARS (string);
482 /* This set multibyte to 0 even if STRING is multibyte when it
483 contains only ascii and eight-bit-graphic, but that's
484 intentional. */
485 int multibyte = len < SBYTES (string);
486 unsigned char *str = SDATA (string);
487 int i = 0, i_byte = 0;
488 int width = 0;
489 struct Lisp_Char_Table *dp = buffer_display_table ();
490
491 while (i < len)
492 {
493 int chars, bytes, thiswidth;
494 Lisp_Object val;
495 int cmp_id;
496 EMACS_INT ignore, end;
497
498 if (find_composition (i, -1, &ignore, &end, &val, string)
499 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
500 >= 0))
501 {
502 thiswidth = composition_table[cmp_id]->width;
503 chars = end - i;
504 bytes = string_char_to_byte (string, end) - i_byte;
505 }
506 else
507 {
508 int c;
509
510 if (multibyte)
511 c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
512 else
513 c = str[i_byte], bytes = 1;
514 chars = 1;
515 if (dp)
516 {
517 val = DISP_CHAR_VECTOR (dp, c);
518 if (VECTORP (val))
519 thiswidth = XVECTOR (val)->size;
520 else
521 thiswidth = CHAR_WIDTH (c);
522 }
523 else
524 {
525 thiswidth = CHAR_WIDTH (c);
526 }
527 }
528
529 if (precision > 0
530 && (width + thiswidth > precision))
531 {
532 *nchars = i;
533 *nbytes = i_byte;
534 return width;
535 }
536 i += chars;
537 i_byte += bytes;
538 width += thiswidth;
539 }
540
541 if (precision > 0)
542 {
543 *nchars = i;
544 *nbytes = i_byte;
545 }
546
547 return width;
548 }
549
550 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
551 doc: /* Return width of STRING when displayed in the current buffer.
552 Width is measured by how many columns it occupies on the screen.
553 When calculating width of a multibyte character in STRING,
554 only the base leading-code is considered; the validity of
555 the following bytes is not checked. Tabs in STRING are always
556 taken to occupy `tab-width' columns. */)
557 (str)
558 Lisp_Object str;
559 {
560 Lisp_Object val;
561
562 CHECK_STRING (str);
563 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
564 return val;
565 }
566
567 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
568 doc: /* Return the direction of CHAR.
569 The returned value is 0 for left-to-right and 1 for right-to-left. */)
570 (ch)
571 Lisp_Object ch;
572 {
573 int c;
574
575 CHECK_CHARACTER (ch);
576 c = XINT (ch);
577 return CHAR_TABLE_REF (Vchar_direction_table, c);
578 }
579
580 /* Return the number of characters in the NBYTES bytes at PTR.
581 This works by looking at the contents and checking for multibyte
582 sequences while assuming that there's no invalid sequence.
583 However, if the current buffer has enable-multibyte-characters =
584 nil, we treat each byte as a character. */
585
586 EMACS_INT
587 chars_in_text (ptr, nbytes)
588 const unsigned char *ptr;
589 EMACS_INT nbytes;
590 {
591 /* current_buffer is null at early stages of Emacs initialization. */
592 if (current_buffer == 0
593 || NILP (current_buffer->enable_multibyte_characters))
594 return nbytes;
595
596 return multibyte_chars_in_text (ptr, nbytes);
597 }
598
599 /* Return the number of characters in the NBYTES bytes at PTR.
600 This works by looking at the contents and checking for multibyte
601 sequences while assuming that there's no invalid sequence. It
602 ignores enable-multibyte-characters. */
603
604 EMACS_INT
605 multibyte_chars_in_text (ptr, nbytes)
606 const unsigned char *ptr;
607 EMACS_INT nbytes;
608 {
609 const unsigned char *endp = ptr + nbytes;
610 int chars = 0;
611
612 while (ptr < endp)
613 {
614 int len = MULTIBYTE_LENGTH (ptr, endp);
615
616 if (len == 0)
617 abort ();
618 ptr += len;
619 chars++;
620 }
621
622 return chars;
623 }
624
625 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
626 characters and bytes in it, and store them in *NCHARS and *NBYTES
627 respectively. On counting bytes, pay attention to that 8-bit
628 characters not constructing a valid multibyte sequence are
629 represented by 2-byte in a multibyte text. */
630
631 void
632 parse_str_as_multibyte (str, len, nchars, nbytes)
633 const unsigned char *str;
634 int len, *nchars, *nbytes;
635 {
636 const unsigned char *endp = str + len;
637 int n, chars = 0, bytes = 0;
638
639 if (len >= MAX_MULTIBYTE_LENGTH)
640 {
641 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
642 while (str < adjusted_endp)
643 {
644 if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
645 str += n, bytes += n;
646 else
647 str++, bytes += 2;
648 chars++;
649 }
650 }
651 while (str < endp)
652 {
653 if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
654 str += n, bytes += n;
655 else
656 str++, bytes += 2;
657 chars++;
658 }
659
660 *nchars = chars;
661 *nbytes = bytes;
662 return;
663 }
664
665 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
666 It actually converts only such 8-bit characters that don't contruct
667 a multibyte sequence to multibyte forms of Latin-1 characters. If
668 NCHARS is nonzero, set *NCHARS to the number of characters in the
669 text. It is assured that we can use LEN bytes at STR as a work
670 area and that is enough. Return the number of bytes of the
671 resulting text. */
672
673 int
674 str_as_multibyte (str, len, nbytes, nchars)
675 unsigned char *str;
676 int len, nbytes, *nchars;
677 {
678 unsigned char *p = str, *endp = str + nbytes;
679 unsigned char *to;
680 int chars = 0;
681 int n;
682
683 if (nbytes >= MAX_MULTIBYTE_LENGTH)
684 {
685 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
686 while (p < adjusted_endp
687 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
688 p += n, chars++;
689 }
690 while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
691 p += n, chars++;
692 if (nchars)
693 *nchars = chars;
694 if (p == endp)
695 return nbytes;
696
697 to = p;
698 nbytes = endp - p;
699 endp = str + len;
700 safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
701 p = endp - nbytes;
702
703 if (nbytes >= MAX_MULTIBYTE_LENGTH)
704 {
705 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
706 while (p < adjusted_endp)
707 {
708 if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
709 {
710 while (n--)
711 *to++ = *p++;
712 }
713 else
714 {
715 int c = *p++;
716 c = BYTE8_TO_CHAR (c);
717 to += CHAR_STRING (c, to);
718 }
719 }
720 chars++;
721 }
722 while (p < endp)
723 {
724 if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
725 {
726 while (n--)
727 *to++ = *p++;
728 }
729 else
730 {
731 int c = *p++;
732 c = BYTE8_TO_CHAR (c);
733 to += CHAR_STRING (c, to);
734 }
735 chars++;
736 }
737 if (nchars)
738 *nchars = chars;
739 return (to - str);
740 }
741
742 /* Parse unibyte string at STR of LEN bytes, and return the number of
743 bytes it may ocupy when converted to multibyte string by
744 `str_to_multibyte'. */
745
746 int
747 parse_str_to_multibyte (str, len)
748 unsigned char *str;
749 int len;
750 {
751 unsigned char *endp = str + len;
752 int bytes;
753
754 for (bytes = 0; str < endp; str++)
755 bytes += (*str < 0x80) ? 1 : 2;
756 return bytes;
757 }
758
759
760 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
761 that contains the same single-byte characters. It actually
762 converts all 8-bit characters to multibyte forms. It is assured
763 that we can use LEN bytes at STR as a work area and that is
764 enough. */
765
766 int
767 str_to_multibyte (str, len, bytes)
768 unsigned char *str;
769 int len, bytes;
770 {
771 unsigned char *p = str, *endp = str + bytes;
772 unsigned char *to;
773
774 while (p < endp && *p < 0x80) p++;
775 if (p == endp)
776 return bytes;
777 to = p;
778 bytes = endp - p;
779 endp = str + len;
780 safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
781 p = endp - bytes;
782 while (p < endp)
783 {
784 int c = *p++;
785
786 if (c >= 0x80)
787 c = BYTE8_TO_CHAR (c);
788 to += CHAR_STRING (c, to);
789 }
790 return (to - str);
791 }
792
793 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
794 actually converts characters in the range 0x80..0xFF to
795 unibyte. */
796
797 int
798 str_as_unibyte (str, bytes)
799 unsigned char *str;
800 int bytes;
801 {
802 const unsigned char *p = str, *endp = str + bytes;
803 unsigned char *to;
804 int c, len;
805
806 while (p < endp)
807 {
808 c = *p;
809 len = BYTES_BY_CHAR_HEAD (c);
810 if (CHAR_BYTE8_HEAD_P (c))
811 break;
812 p += len;
813 }
814 to = str + (p - str);
815 while (p < endp)
816 {
817 c = *p;
818 len = BYTES_BY_CHAR_HEAD (c);
819 if (CHAR_BYTE8_HEAD_P (c))
820 {
821 c = STRING_CHAR_ADVANCE (p);
822 *to++ = CHAR_TO_BYTE8 (c);
823 }
824 else
825 {
826 while (len--) *to++ = *p++;
827 }
828 }
829 return (to - str);
830 }
831
832 int
833 string_count_byte8 (string)
834 Lisp_Object string;
835 {
836 int multibyte = STRING_MULTIBYTE (string);
837 int nbytes = SBYTES (string);
838 unsigned char *p = SDATA (string);
839 unsigned char *pend = p + nbytes;
840 int count = 0;
841 int c, len;
842
843 if (multibyte)
844 while (p < pend)
845 {
846 c = *p;
847 len = BYTES_BY_CHAR_HEAD (c);
848
849 if (CHAR_BYTE8_HEAD_P (c))
850 count++;
851 p += len;
852 }
853 else
854 while (p < pend)
855 {
856 if (*p++ >= 0x80)
857 count++;
858 }
859 return count;
860 }
861
862
863 Lisp_Object
864 string_escape_byte8 (string)
865 Lisp_Object string;
866 {
867 int nchars = SCHARS (string);
868 int nbytes = SBYTES (string);
869 int multibyte = STRING_MULTIBYTE (string);
870 int byte8_count;
871 const unsigned char *src, *src_end;
872 unsigned char *dst;
873 Lisp_Object val;
874 int c, len;
875
876 if (multibyte && nchars == nbytes)
877 return string;
878
879 byte8_count = string_count_byte8 (string);
880
881 if (byte8_count == 0)
882 return string;
883
884 if (multibyte)
885 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
886 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
887 nbytes + byte8_count * 2);
888 else
889 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
890 val = make_uninit_string (nbytes + byte8_count * 3);
891
892 src = SDATA (string);
893 src_end = src + nbytes;
894 dst = SDATA (val);
895 if (multibyte)
896 while (src < src_end)
897 {
898 c = *src;
899 len = BYTES_BY_CHAR_HEAD (c);
900
901 if (CHAR_BYTE8_HEAD_P (c))
902 {
903 c = STRING_CHAR_ADVANCE (src);
904 c = CHAR_TO_BYTE8 (c);
905 sprintf ((char *) dst, "\\%03o", c);
906 dst += 4;
907 }
908 else
909 while (len--) *dst++ = *src++;
910 }
911 else
912 while (src < src_end)
913 {
914 c = *src++;
915 if (c >= 0x80)
916 {
917 sprintf ((char *) dst, "\\%03o", c);
918 dst += 4;
919 }
920 else
921 *dst++ = c;
922 }
923 return val;
924 }
925
926 \f
927 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
928 doc: /*
929 Concatenate all the argument characters and make the result a string.
930 usage: (string &rest CHARACTERS) */)
931 (n, args)
932 int n;
933 Lisp_Object *args;
934 {
935 int i;
936 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
937 unsigned char *p = buf;
938 int c;
939
940 for (i = 0; i < n; i++)
941 {
942 CHECK_CHARACTER (args[i]);
943 c = XINT (args[i]);
944 p += CHAR_STRING (c, p);
945 }
946
947 return make_string_from_bytes ((char *) buf, n, p - buf);
948 }
949
950 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
951 doc: /* Concatenate all the argument bytes and make the result a unibyte string.
952 usage: (unibyte-string &rest BYTES) */)
953 (n, args)
954 int n;
955 Lisp_Object *args;
956 {
957 int i;
958 unsigned char *buf = (unsigned char *) alloca (n);
959 unsigned char *p = buf;
960 unsigned c;
961
962 for (i = 0; i < n; i++)
963 {
964 CHECK_NATNUM (args[i]);
965 c = XFASTINT (args[i]);
966 if (c >= 256)
967 args_out_of_range_3 (args[i], make_number (0), make_number (255));
968 *p++ = c;
969 }
970
971 return make_string_from_bytes ((char *) buf, n, p - buf);
972 }
973
974 DEFUN ("char-resolve-modifers", Fchar_resolve_modifiers,
975 Schar_resolve_modifiers, 1, 1, 0,
976 doc: /* Resolve modifiers in the character CHAR.
977 The value is a character with modifiers resolved into the character
978 code. Unresolved modifiers are kept in the value.
979 usage: (char-resolve-modifers CHAR) */)
980 (character)
981 Lisp_Object character;
982 {
983 int c;
984
985 CHECK_NUMBER (character);
986 c = XINT (character);
987 return make_number (char_resolve_modifier_mask (c));
988 }
989
990 void
991 init_character_once ()
992 {
993 }
994
995 #ifdef emacs
996
997 void
998 syms_of_character ()
999 {
1000 DEFSYM (Qcharacterp, "characterp");
1001 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1002
1003 staticpro (&Vchar_unify_table);
1004 Vchar_unify_table = Qnil;
1005
1006 defsubr (&Smax_char);
1007 defsubr (&Scharacterp);
1008 defsubr (&Sunibyte_char_to_multibyte);
1009 defsubr (&Smultibyte_char_to_unibyte);
1010 defsubr (&Schar_bytes);
1011 defsubr (&Schar_width);
1012 defsubr (&Sstring_width);
1013 defsubr (&Schar_direction);
1014 defsubr (&Sstring);
1015 defsubr (&Sunibyte_string);
1016 defsubr (&Schar_resolve_modifiers);
1017
1018 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
1019 doc: /*
1020 Vector recording all translation tables ever defined.
1021 Each element is a pair (SYMBOL . TABLE) relating the table to the
1022 symbol naming it. The ID of a translation table is an index into this vector. */);
1023 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1024
1025 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1026 doc: /*
1027 A char-table for characters which invoke auto-filling.
1028 Such characters have value t in this table. */);
1029 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1030 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1031 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
1032
1033 DEFVAR_LISP ("char-width-table", &Vchar_width_table,
1034 doc: /*
1035 A char-table for width (columns) of each character. */);
1036 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
1037 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1038 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1039 make_number (4));
1040
1041 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
1042 doc: /* A char-table for direction of each character. */);
1043 Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
1044
1045 DEFVAR_LISP ("printable-chars", &Vprintable_chars,
1046 doc: /* A char-table for each printable character. */);
1047 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
1048 Fset_char_table_range (Vprintable_chars,
1049 Fcons (make_number (32), make_number (126)), Qt);
1050 Fset_char_table_range (Vprintable_chars,
1051 Fcons (make_number (160),
1052 make_number (MAX_5_BYTE_CHAR)), Qt);
1053
1054 DEFVAR_LISP ("char-script-table", &Vchar_script_table,
1055 doc: /* Char table of script symbols.
1056 It has one extra slot whose value is a list of script symbols. */);
1057
1058 /* Intern this now in case it isn't already done.
1059 Setting this variable twice is harmless.
1060 But don't staticpro it here--that is done in alloc.c. */
1061 Qchar_table_extra_slots = intern ("char-table-extra-slots");
1062 DEFSYM (Qchar_script_table, "char-script-table");
1063 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1064 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
1065
1066 DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
1067 doc: /* Alist of scripts vs the representative characters. */);
1068 Vscript_representative_chars = Qnil;
1069 }
1070
1071 #endif /* emacs */
1072
1073 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1074 (do not change this comment) */