Merge from mainline.
[bpt/emacs.git] / src / character.c
1 /* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
3 Licensed to the Free Software Foundation.
4 Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
5 Free Software Foundation, Inc.
6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
7 National Institute of Advanced Industrial Science and Technology (AIST)
8 Registration Number H13PRO009
9
10 This file is part of GNU Emacs.
11
12 GNU Emacs is free software: you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation, either version 3 of the License, or
15 (at your option) any later version.
16
17 GNU Emacs is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License for more details.
21
22 You should have received a copy of the GNU General Public License
23 along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
24
25 /* At first, see the document in `character.h' to understand the code
26 in this file. */
27
28 #ifdef emacs
29 #include <config.h>
30 #endif
31
32 #include <stdio.h>
33
34 #ifdef emacs
35
36 #include <sys/types.h>
37 #include <setjmp.h>
38 #include "lisp.h"
39 #include "character.h"
40 #include "buffer.h"
41 #include "charset.h"
42 #include "composite.h"
43 #include "disptab.h"
44
45 #else /* not emacs */
46
47 #include "mulelib.h"
48
49 #endif /* emacs */
50
51 Lisp_Object Qcharacterp;
52
53 /* Vector of translation table ever defined.
54 ID of a translation table is used to index this vector. */
55 Lisp_Object Vtranslation_table_vector;
56
57 /* A char-table for characters which may invoke auto-filling. */
58 Lisp_Object Vauto_fill_chars;
59
60 Lisp_Object Qauto_fill_chars;
61
62 /* Char-table of information about which character to unify to which
63 Unicode character. Mainly used by the macro MAYBE_UNIFY_CHAR. */
64 Lisp_Object Vchar_unify_table;
65
66 /* A char-table. An element is non-nil iff the corresponding
67 character has a printable glyph. */
68 Lisp_Object Vprintable_chars;
69
70 /* A char-table. An elemnent is a column-width of the corresponding
71 character. */
72 Lisp_Object Vchar_width_table;
73
74 /* A char-table. An element is a symbol indicating the direction
75 property of corresponding character. */
76 Lisp_Object Vchar_direction_table;
77
78 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
79 unsigned char *_fetch_multibyte_char_p;
80
81 /* Char table of scripts. */
82 Lisp_Object Vchar_script_table;
83
84 /* Alist of scripts vs representative characters. */
85 Lisp_Object Vscript_representative_chars;
86
87 static Lisp_Object Qchar_script_table;
88
89 Lisp_Object Vunicode_category_table;
90 \f
91
92 /* If character code C has modifier masks, reflect them to the
93 character code if possible. Return the resulting code. */
94
95 int
96 char_resolve_modifier_mask (c)
97 int c;
98 {
99 /* A non-ASCII character can't reflect modifier bits to the code. */
100 if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
101 return c;
102
103 /* For Meta, Shift, and Control modifiers, we need special care. */
104 if (c & CHAR_SHIFT)
105 {
106 /* Shift modifier is valid only with [A-Za-z]. */
107 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
108 c &= ~CHAR_SHIFT;
109 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
110 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
111 /* Shift modifier for control characters and SPC is ignored. */
112 else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
113 c &= ~CHAR_SHIFT;
114 }
115 if (c & CHAR_CTL)
116 {
117 /* Simulate the code in lread.c. */
118 /* Allow `\C- ' and `\C-?'. */
119 if ((c & 0377) == ' ')
120 c &= ~0177 & ~ CHAR_CTL;
121 else if ((c & 0377) == '?')
122 c = 0177 | (c & ~0177 & ~CHAR_CTL);
123 /* ASCII control chars are made from letters (both cases),
124 as well as the non-letters within 0100...0137. */
125 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
126 c &= (037 | (~0177 & ~CHAR_CTL));
127 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
128 c &= (037 | (~0177 & ~CHAR_CTL));
129 }
130 #if 0 /* This is outside the scope of this function. (bug#4751) */
131 if (c & CHAR_META)
132 {
133 /* Move the meta bit to the right place for a string. */
134 c = (c & ~CHAR_META) | 0x80;
135 }
136 #endif
137
138 return c;
139 }
140
141
142 /* Store multibyte form of character C at P. If C has modifier bits,
143 handle them appropriately. */
144
145 int
146 char_string (c, p)
147 unsigned c;
148 unsigned char *p;
149 {
150 int bytes;
151
152 if (c & CHAR_MODIFIER_MASK)
153 {
154 c = (unsigned) char_resolve_modifier_mask ((int) c);
155 /* If C still has any modifier bits, just ignore it. */
156 c &= ~CHAR_MODIFIER_MASK;
157 }
158
159 MAYBE_UNIFY_CHAR (c);
160
161 if (c <= MAX_3_BYTE_CHAR)
162 {
163 bytes = CHAR_STRING (c, p);
164 }
165 else if (c <= MAX_4_BYTE_CHAR)
166 {
167 p[0] = (0xF0 | (c >> 18));
168 p[1] = (0x80 | ((c >> 12) & 0x3F));
169 p[2] = (0x80 | ((c >> 6) & 0x3F));
170 p[3] = (0x80 | (c & 0x3F));
171 bytes = 4;
172 }
173 else if (c <= MAX_5_BYTE_CHAR)
174 {
175 p[0] = 0xF8;
176 p[1] = (0x80 | ((c >> 18) & 0x0F));
177 p[2] = (0x80 | ((c >> 12) & 0x3F));
178 p[3] = (0x80 | ((c >> 6) & 0x3F));
179 p[4] = (0x80 | (c & 0x3F));
180 bytes = 5;
181 }
182 else if (c <= MAX_CHAR)
183 {
184 c = CHAR_TO_BYTE8 (c);
185 bytes = BYTE8_STRING (c, p);
186 }
187 else
188 error ("Invalid character: %d", c);
189
190 return bytes;
191 }
192
193
194 /* Return a character whose multibyte form is at P. Set LEN is not
195 NULL, it must be a pointer to integer. In that case, set *LEN to
196 the byte length of the multibyte form. If ADVANCED is not NULL, is
197 must be a pointer to unsigned char. In that case, set *ADVANCED to
198 the ending address (i.e. the starting address of the next
199 character) of the multibyte form. */
200
201 int
202 string_char (p, advanced, len)
203 const unsigned char *p;
204 const unsigned char **advanced;
205 int *len;
206 {
207 int c;
208 const unsigned char *saved_p = p;
209
210 if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
211 {
212 c = STRING_CHAR_ADVANCE (p);
213 }
214 else if (! (*p & 0x08))
215 {
216 c = ((((p)[0] & 0xF) << 18)
217 | (((p)[1] & 0x3F) << 12)
218 | (((p)[2] & 0x3F) << 6)
219 | ((p)[3] & 0x3F));
220 p += 4;
221 }
222 else
223 {
224 c = ((((p)[1] & 0x3F) << 18)
225 | (((p)[2] & 0x3F) << 12)
226 | (((p)[3] & 0x3F) << 6)
227 | ((p)[4] & 0x3F));
228 p += 5;
229 }
230
231 MAYBE_UNIFY_CHAR (c);
232
233 if (len)
234 *len = p - saved_p;
235 if (advanced)
236 *advanced = p;
237 return c;
238 }
239
240
241 /* Translate character C by translation table TABLE. If C is
242 negative, translate a character specified by CHARSET and CODE. If
243 no translation is found in TABLE, return the untranslated
244 character. If TABLE is a list, elements are char tables. In this
245 case, translace C by all tables. */
246
247 int
248 translate_char (table, c)
249 Lisp_Object table;
250 int c;
251 {
252 if (CHAR_TABLE_P (table))
253 {
254 Lisp_Object ch;
255
256 ch = CHAR_TABLE_REF (table, c);
257 if (CHARACTERP (ch))
258 c = XINT (ch);
259 }
260 else
261 {
262 for (; CONSP (table); table = XCDR (table))
263 c = translate_char (XCAR (table), c);
264 }
265 return c;
266 }
267
268 /* Convert ASCII or 8-bit character C to unibyte. If C is none of
269 them, return (C & 0xFF).
270
271 The argument REV_TBL is now ignored. It will be removed in the
272 future. */
273
274 int
275 multibyte_char_to_unibyte (c, rev_tbl)
276 int c;
277 Lisp_Object rev_tbl;
278 {
279 if (c < 0x80)
280 return c;
281 if (CHAR_BYTE8_P (c))
282 return CHAR_TO_BYTE8 (c);
283 return (c & 0xFF);
284 }
285
286 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
287 by charset_unibyte. */
288
289 int
290 multibyte_char_to_unibyte_safe (c)
291 int c;
292 {
293 if (c < 0x80)
294 return c;
295 if (CHAR_BYTE8_P (c))
296 return CHAR_TO_BYTE8 (c);
297 return -1;
298 }
299
300 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
301 doc: /* Return non-nil if OBJECT is a character. */)
302 (object, ignore)
303 Lisp_Object object, ignore;
304 {
305 return (CHARACTERP (object) ? Qt : Qnil);
306 }
307
308 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
309 doc: /* Return the character of the maximum code. */)
310 ()
311 {
312 return make_number (MAX_CHAR);
313 }
314
315 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
316 Sunibyte_char_to_multibyte, 1, 1, 0,
317 doc: /* Convert the byte CH to multibyte character. */)
318 (ch)
319 Lisp_Object ch;
320 {
321 int c;
322
323 CHECK_CHARACTER (ch);
324 c = XFASTINT (ch);
325 if (c >= 0x100)
326 error ("Not a unibyte character: %d", c);
327 MAKE_CHAR_MULTIBYTE (c);
328 return make_number (c);
329 }
330
331 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
332 Smultibyte_char_to_unibyte, 1, 1, 0,
333 doc: /* Convert the multibyte character CH to a byte.
334 If the multibyte character does not represent a byte, return -1. */)
335 (ch)
336 Lisp_Object ch;
337 {
338 int cm;
339
340 CHECK_CHARACTER (ch);
341 cm = XFASTINT (ch);
342 if (cm < 256)
343 /* Can't distinguish a byte read from a unibyte buffer from
344 a latin1 char, so let's let it slide. */
345 return ch;
346 else
347 {
348 int cu = CHAR_TO_BYTE_SAFE (cm);
349 return make_number (cu);
350 }
351 }
352
353 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
354 doc: /* Return 1 regardless of the argument CHAR.
355 This is now an obsolete function. We keep it just for backward compatibility.
356 usage: (char-bytes CHAR) */)
357 (ch)
358 Lisp_Object ch;
359 {
360 CHECK_CHARACTER (ch);
361 return make_number (1);
362 }
363
364 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
365 doc: /* Return width of CHAR when displayed in the current buffer.
366 The width is measured by how many columns it occupies on the screen.
367 Tab is taken to occupy `tab-width' columns.
368 usage: (char-width CHAR) */)
369 (ch)
370 Lisp_Object ch;
371 {
372 Lisp_Object disp;
373 int c, width;
374 struct Lisp_Char_Table *dp = buffer_display_table ();
375
376 CHECK_CHARACTER (ch);
377 c = XINT (ch);
378
379 /* Get the way the display table would display it. */
380 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
381
382 if (VECTORP (disp))
383 width = ASIZE (disp);
384 else
385 width = CHAR_WIDTH (c);
386
387 return make_number (width);
388 }
389
390 /* Return width of string STR of length LEN when displayed in the
391 current buffer. The width is measured by how many columns it
392 occupies on the screen. If PRECISION > 0, return the width of
393 longest substring that doesn't exceed PRECISION, and set number of
394 characters and bytes of the substring in *NCHARS and *NBYTES
395 respectively. */
396
397 int
398 c_string_width (const unsigned char *str, int len, int precision, int *nchars, int *nbytes)
399 {
400 int i = 0, i_byte = 0;
401 int width = 0;
402 struct Lisp_Char_Table *dp = buffer_display_table ();
403
404 while (i_byte < len)
405 {
406 int bytes, thiswidth;
407 Lisp_Object val;
408 int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
409
410 if (dp)
411 {
412 val = DISP_CHAR_VECTOR (dp, c);
413 if (VECTORP (val))
414 thiswidth = XVECTOR (val)->size;
415 else
416 thiswidth = CHAR_WIDTH (c);
417 }
418 else
419 {
420 thiswidth = CHAR_WIDTH (c);
421 }
422
423 if (precision > 0
424 && (width + thiswidth > precision))
425 {
426 *nchars = i;
427 *nbytes = i_byte;
428 return width;
429 }
430 i++;
431 i_byte += bytes;
432 width += thiswidth;
433 }
434
435 if (precision > 0)
436 {
437 *nchars = i;
438 *nbytes = i_byte;
439 }
440
441 return width;
442 }
443
444 /* Return width of string STR of length LEN when displayed in the
445 current buffer. The width is measured by how many columns it
446 occupies on the screen. */
447
448 int
449 strwidth (str, len)
450 unsigned char *str;
451 int len;
452 {
453 return c_string_width (str, len, -1, NULL, NULL);
454 }
455
456 /* Return width of Lisp string STRING when displayed in the current
457 buffer. The width is measured by how many columns it occupies on
458 the screen while paying attention to compositions. If PRECISION >
459 0, return the width of longest substring that doesn't exceed
460 PRECISION, and set number of characters and bytes of the substring
461 in *NCHARS and *NBYTES respectively. */
462
463 int
464 lisp_string_width (string, precision, nchars, nbytes)
465 Lisp_Object string;
466 int precision, *nchars, *nbytes;
467 {
468 int len = SCHARS (string);
469 /* This set multibyte to 0 even if STRING is multibyte when it
470 contains only ascii and eight-bit-graphic, but that's
471 intentional. */
472 int multibyte = len < SBYTES (string);
473 unsigned char *str = SDATA (string);
474 int i = 0, i_byte = 0;
475 int width = 0;
476 struct Lisp_Char_Table *dp = buffer_display_table ();
477
478 while (i < len)
479 {
480 int chars, bytes, thiswidth;
481 Lisp_Object val;
482 int cmp_id;
483 EMACS_INT ignore, end;
484
485 if (find_composition (i, -1, &ignore, &end, &val, string)
486 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
487 >= 0))
488 {
489 thiswidth = composition_table[cmp_id]->width;
490 chars = end - i;
491 bytes = string_char_to_byte (string, end) - i_byte;
492 }
493 else
494 {
495 int c;
496
497 if (multibyte)
498 c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
499 else
500 c = str[i_byte], bytes = 1;
501 chars = 1;
502 if (dp)
503 {
504 val = DISP_CHAR_VECTOR (dp, c);
505 if (VECTORP (val))
506 thiswidth = XVECTOR (val)->size;
507 else
508 thiswidth = CHAR_WIDTH (c);
509 }
510 else
511 {
512 thiswidth = CHAR_WIDTH (c);
513 }
514 }
515
516 if (precision > 0
517 && (width + thiswidth > precision))
518 {
519 *nchars = i;
520 *nbytes = i_byte;
521 return width;
522 }
523 i += chars;
524 i_byte += bytes;
525 width += thiswidth;
526 }
527
528 if (precision > 0)
529 {
530 *nchars = i;
531 *nbytes = i_byte;
532 }
533
534 return width;
535 }
536
537 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
538 doc: /* Return width of STRING when displayed in the current buffer.
539 Width is measured by how many columns it occupies on the screen.
540 When calculating width of a multibyte character in STRING,
541 only the base leading-code is considered; the validity of
542 the following bytes is not checked. Tabs in STRING are always
543 taken to occupy `tab-width' columns.
544 usage: (string-width STRING) */)
545 (str)
546 Lisp_Object str;
547 {
548 Lisp_Object val;
549
550 CHECK_STRING (str);
551 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
552 return val;
553 }
554
555 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
556 doc: /* Return the direction of CHAR.
557 The returned value is 0 for left-to-right and 1 for right-to-left.
558 usage: (char-direction CHAR) */)
559 (ch)
560 Lisp_Object ch;
561 {
562 int c;
563
564 CHECK_CHARACTER (ch);
565 c = XINT (ch);
566 return CHAR_TABLE_REF (Vchar_direction_table, c);
567 }
568
569 /* Return the number of characters in the NBYTES bytes at PTR.
570 This works by looking at the contents and checking for multibyte
571 sequences while assuming that there's no invalid sequence.
572 However, if the current buffer has enable-multibyte-characters =
573 nil, we treat each byte as a character. */
574
575 EMACS_INT
576 chars_in_text (ptr, nbytes)
577 const unsigned char *ptr;
578 EMACS_INT nbytes;
579 {
580 /* current_buffer is null at early stages of Emacs initialization. */
581 if (current_buffer == 0
582 || NILP (current_buffer->enable_multibyte_characters))
583 return nbytes;
584
585 return multibyte_chars_in_text (ptr, nbytes);
586 }
587
588 /* Return the number of characters in the NBYTES bytes at PTR.
589 This works by looking at the contents and checking for multibyte
590 sequences while assuming that there's no invalid sequence. It
591 ignores enable-multibyte-characters. */
592
593 EMACS_INT
594 multibyte_chars_in_text (ptr, nbytes)
595 const unsigned char *ptr;
596 EMACS_INT nbytes;
597 {
598 const unsigned char *endp = ptr + nbytes;
599 int chars = 0;
600
601 while (ptr < endp)
602 {
603 int len = MULTIBYTE_LENGTH (ptr, endp);
604
605 if (len == 0)
606 abort ();
607 ptr += len;
608 chars++;
609 }
610
611 return chars;
612 }
613
614 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
615 characters and bytes in it, and store them in *NCHARS and *NBYTES
616 respectively. On counting bytes, pay attention to that 8-bit
617 characters not constructing a valid multibyte sequence are
618 represented by 2-byte in a multibyte text. */
619
620 void
621 parse_str_as_multibyte (str, len, nchars, nbytes)
622 const unsigned char *str;
623 int len, *nchars, *nbytes;
624 {
625 const unsigned char *endp = str + len;
626 int n, chars = 0, bytes = 0;
627
628 if (len >= MAX_MULTIBYTE_LENGTH)
629 {
630 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
631 while (str < adjusted_endp)
632 {
633 if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
634 str += n, bytes += n;
635 else
636 str++, bytes += 2;
637 chars++;
638 }
639 }
640 while (str < endp)
641 {
642 if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
643 str += n, bytes += n;
644 else
645 str++, bytes += 2;
646 chars++;
647 }
648
649 *nchars = chars;
650 *nbytes = bytes;
651 return;
652 }
653
654 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
655 It actually converts only such 8-bit characters that don't contruct
656 a multibyte sequence to multibyte forms of Latin-1 characters. If
657 NCHARS is nonzero, set *NCHARS to the number of characters in the
658 text. It is assured that we can use LEN bytes at STR as a work
659 area and that is enough. Return the number of bytes of the
660 resulting text. */
661
662 int
663 str_as_multibyte (str, len, nbytes, nchars)
664 unsigned char *str;
665 int len, nbytes, *nchars;
666 {
667 unsigned char *p = str, *endp = str + nbytes;
668 unsigned char *to;
669 int chars = 0;
670 int n;
671
672 if (nbytes >= MAX_MULTIBYTE_LENGTH)
673 {
674 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
675 while (p < adjusted_endp
676 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
677 p += n, chars++;
678 }
679 while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
680 p += n, chars++;
681 if (nchars)
682 *nchars = chars;
683 if (p == endp)
684 return nbytes;
685
686 to = p;
687 nbytes = endp - p;
688 endp = str + len;
689 safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
690 p = endp - nbytes;
691
692 if (nbytes >= MAX_MULTIBYTE_LENGTH)
693 {
694 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
695 while (p < adjusted_endp)
696 {
697 if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
698 {
699 while (n--)
700 *to++ = *p++;
701 }
702 else
703 {
704 int c = *p++;
705 c = BYTE8_TO_CHAR (c);
706 to += CHAR_STRING (c, to);
707 }
708 }
709 chars++;
710 }
711 while (p < endp)
712 {
713 if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
714 {
715 while (n--)
716 *to++ = *p++;
717 }
718 else
719 {
720 int c = *p++;
721 c = BYTE8_TO_CHAR (c);
722 to += CHAR_STRING (c, to);
723 }
724 chars++;
725 }
726 if (nchars)
727 *nchars = chars;
728 return (to - str);
729 }
730
731 /* Parse unibyte string at STR of LEN bytes, and return the number of
732 bytes it may ocupy when converted to multibyte string by
733 `str_to_multibyte'. */
734
735 int
736 parse_str_to_multibyte (str, len)
737 unsigned char *str;
738 int len;
739 {
740 unsigned char *endp = str + len;
741 int bytes;
742
743 for (bytes = 0; str < endp; str++)
744 bytes += (*str < 0x80) ? 1 : 2;
745 return bytes;
746 }
747
748
749 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
750 that contains the same single-byte characters. It actually
751 converts all 8-bit characters to multibyte forms. It is assured
752 that we can use LEN bytes at STR as a work area and that is
753 enough. */
754
755 int
756 str_to_multibyte (str, len, bytes)
757 unsigned char *str;
758 int len, bytes;
759 {
760 unsigned char *p = str, *endp = str + bytes;
761 unsigned char *to;
762
763 while (p < endp && *p < 0x80) p++;
764 if (p == endp)
765 return bytes;
766 to = p;
767 bytes = endp - p;
768 endp = str + len;
769 safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
770 p = endp - bytes;
771 while (p < endp)
772 {
773 int c = *p++;
774
775 if (c >= 0x80)
776 c = BYTE8_TO_CHAR (c);
777 to += CHAR_STRING (c, to);
778 }
779 return (to - str);
780 }
781
782 /* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
783 actually converts characters in the range 0x80..0xFF to
784 unibyte. */
785
786 int
787 str_as_unibyte (str, bytes)
788 unsigned char *str;
789 int bytes;
790 {
791 const unsigned char *p = str, *endp = str + bytes;
792 unsigned char *to;
793 int c, len;
794
795 while (p < endp)
796 {
797 c = *p;
798 len = BYTES_BY_CHAR_HEAD (c);
799 if (CHAR_BYTE8_HEAD_P (c))
800 break;
801 p += len;
802 }
803 to = str + (p - str);
804 while (p < endp)
805 {
806 c = *p;
807 len = BYTES_BY_CHAR_HEAD (c);
808 if (CHAR_BYTE8_HEAD_P (c))
809 {
810 c = STRING_CHAR_ADVANCE (p);
811 *to++ = CHAR_TO_BYTE8 (c);
812 }
813 else
814 {
815 while (len--) *to++ = *p++;
816 }
817 }
818 return (to - str);
819 }
820
821 /* Convert eight-bit chars in SRC (in multibyte form) to the
822 corresponding byte and store in DST. CHARS is the number of
823 characters in SRC. The value is the number of bytes stored in DST.
824 Usually, the value is the same as CHARS, but is less than it if SRC
825 contains a non-ASCII, non-eight-bit characater. If ACCEPT_LATIN_1
826 is nonzero, a Latin-1 character is accepted and converted to a byte
827 of that character code.
828 Note: Currently the arg ACCEPT_LATIN_1 is not used. */
829
830 EMACS_INT
831 str_to_unibyte (src, dst, chars, accept_latin_1)
832 const unsigned char *src;
833 unsigned char *dst;
834 EMACS_INT chars;
835 int accept_latin_1;
836 {
837 EMACS_INT i;
838
839 for (i = 0; i < chars; i++)
840 {
841 int c = STRING_CHAR_ADVANCE (src);
842
843 if (CHAR_BYTE8_P (c))
844 c = CHAR_TO_BYTE8 (c);
845 else if (! ASCII_CHAR_P (c)
846 && (! accept_latin_1 || c >= 0x100))
847 return i;
848 *dst++ = c;
849 }
850 return i;
851 }
852
853
854 int
855 string_count_byte8 (string)
856 Lisp_Object string;
857 {
858 int multibyte = STRING_MULTIBYTE (string);
859 int nbytes = SBYTES (string);
860 unsigned char *p = SDATA (string);
861 unsigned char *pend = p + nbytes;
862 int count = 0;
863 int c, len;
864
865 if (multibyte)
866 while (p < pend)
867 {
868 c = *p;
869 len = BYTES_BY_CHAR_HEAD (c);
870
871 if (CHAR_BYTE8_HEAD_P (c))
872 count++;
873 p += len;
874 }
875 else
876 while (p < pend)
877 {
878 if (*p++ >= 0x80)
879 count++;
880 }
881 return count;
882 }
883
884
885 Lisp_Object
886 string_escape_byte8 (string)
887 Lisp_Object string;
888 {
889 int nchars = SCHARS (string);
890 int nbytes = SBYTES (string);
891 int multibyte = STRING_MULTIBYTE (string);
892 int byte8_count;
893 const unsigned char *src, *src_end;
894 unsigned char *dst;
895 Lisp_Object val;
896 int c, len;
897
898 if (multibyte && nchars == nbytes)
899 return string;
900
901 byte8_count = string_count_byte8 (string);
902
903 if (byte8_count == 0)
904 return string;
905
906 if (multibyte)
907 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
908 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
909 nbytes + byte8_count * 2);
910 else
911 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
912 val = make_uninit_string (nbytes + byte8_count * 3);
913
914 src = SDATA (string);
915 src_end = src + nbytes;
916 dst = SDATA (val);
917 if (multibyte)
918 while (src < src_end)
919 {
920 c = *src;
921 len = BYTES_BY_CHAR_HEAD (c);
922
923 if (CHAR_BYTE8_HEAD_P (c))
924 {
925 c = STRING_CHAR_ADVANCE (src);
926 c = CHAR_TO_BYTE8 (c);
927 sprintf ((char *) dst, "\\%03o", c);
928 dst += 4;
929 }
930 else
931 while (len--) *dst++ = *src++;
932 }
933 else
934 while (src < src_end)
935 {
936 c = *src++;
937 if (c >= 0x80)
938 {
939 sprintf ((char *) dst, "\\%03o", c);
940 dst += 4;
941 }
942 else
943 *dst++ = c;
944 }
945 return val;
946 }
947
948 \f
949 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
950 doc: /*
951 Concatenate all the argument characters and make the result a string.
952 usage: (string &rest CHARACTERS) */)
953 (n, args)
954 int n;
955 Lisp_Object *args;
956 {
957 int i;
958 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
959 unsigned char *p = buf;
960 int c;
961
962 for (i = 0; i < n; i++)
963 {
964 CHECK_CHARACTER (args[i]);
965 c = XINT (args[i]);
966 p += CHAR_STRING (c, p);
967 }
968
969 return make_string_from_bytes ((char *) buf, n, p - buf);
970 }
971
972 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
973 doc: /* Concatenate all the argument bytes and make the result a unibyte string.
974 usage: (unibyte-string &rest BYTES) */)
975 (n, args)
976 int n;
977 Lisp_Object *args;
978 {
979 int i;
980 unsigned char *buf = (unsigned char *) alloca (n);
981 unsigned char *p = buf;
982 unsigned c;
983
984 for (i = 0; i < n; i++)
985 {
986 CHECK_NATNUM (args[i]);
987 c = XFASTINT (args[i]);
988 if (c >= 256)
989 args_out_of_range_3 (args[i], make_number (0), make_number (255));
990 *p++ = c;
991 }
992
993 return make_string_from_bytes ((char *) buf, n, p - buf);
994 }
995
996 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
997 Schar_resolve_modifiers, 1, 1, 0,
998 doc: /* Resolve modifiers in the character CHAR.
999 The value is a character with modifiers resolved into the character
1000 code. Unresolved modifiers are kept in the value.
1001 usage: (char-resolve-modifiers CHAR) */)
1002 (character)
1003 Lisp_Object character;
1004 {
1005 int c;
1006
1007 CHECK_NUMBER (character);
1008 c = XINT (character);
1009 return make_number (char_resolve_modifier_mask (c));
1010 }
1011
1012 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
1013 doc: /* Return a byte value of a character at point.
1014 Optional 1st arg POSITION, if non-nil, is a position of a character to get
1015 a byte value.
1016 Optional 2nd arg STRING, if non-nil, is a string of which first
1017 character is a target to get a byte value. In this case, POSITION, if
1018 non-nil, is an index of a target character in the string.
1019
1020 If the current buffer (or STRING) is multibyte, and the target
1021 character is not ASCII nor 8-bit character, an error is signalled. */)
1022 (position, string)
1023 Lisp_Object position, string;
1024 {
1025 int c;
1026 EMACS_INT pos;
1027 unsigned char *p;
1028
1029 if (NILP (string))
1030 {
1031 if (NILP (position))
1032 {
1033 p = PT_ADDR;
1034 }
1035 else
1036 {
1037 CHECK_NUMBER_COERCE_MARKER (position);
1038 if (XINT (position) < BEGV || XINT (position) >= ZV)
1039 args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
1040 pos = XFASTINT (position);
1041 p = CHAR_POS_ADDR (pos);
1042 }
1043 if (NILP (current_buffer->enable_multibyte_characters))
1044 return make_number (*p);
1045 }
1046 else
1047 {
1048 CHECK_STRING (string);
1049 if (NILP (position))
1050 {
1051 p = SDATA (string);
1052 }
1053 else
1054 {
1055 CHECK_NATNUM (position);
1056 if (XINT (position) >= SCHARS (string))
1057 args_out_of_range (string, position);
1058 pos = XFASTINT (position);
1059 p = SDATA (string) + string_char_to_byte (string, pos);
1060 }
1061 if (! STRING_MULTIBYTE (string))
1062 return make_number (*p);
1063 }
1064 c = STRING_CHAR (p);
1065 if (CHAR_BYTE8_P (c))
1066 c = CHAR_TO_BYTE8 (c);
1067 else if (! ASCII_CHAR_P (c))
1068 error ("Not an ASCII nor an 8-bit character: %d", c);
1069 return make_number (c);
1070 }
1071
1072
1073 void
1074 init_character_once ()
1075 {
1076 }
1077
1078 #ifdef emacs
1079
1080 void
1081 syms_of_character ()
1082 {
1083 DEFSYM (Qcharacterp, "characterp");
1084 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1085
1086 staticpro (&Vchar_unify_table);
1087 Vchar_unify_table = Qnil;
1088
1089 defsubr (&Smax_char);
1090 defsubr (&Scharacterp);
1091 defsubr (&Sunibyte_char_to_multibyte);
1092 defsubr (&Smultibyte_char_to_unibyte);
1093 defsubr (&Schar_bytes);
1094 defsubr (&Schar_width);
1095 defsubr (&Sstring_width);
1096 defsubr (&Schar_direction);
1097 defsubr (&Sstring);
1098 defsubr (&Sunibyte_string);
1099 defsubr (&Schar_resolve_modifiers);
1100 defsubr (&Sget_byte);
1101
1102 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
1103 doc: /*
1104 Vector recording all translation tables ever defined.
1105 Each element is a pair (SYMBOL . TABLE) relating the table to the
1106 symbol naming it. The ID of a translation table is an index into this vector. */);
1107 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1108
1109 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1110 doc: /*
1111 A char-table for characters which invoke auto-filling.
1112 Such characters have value t in this table. */);
1113 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1114 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1115 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
1116
1117 DEFVAR_LISP ("char-width-table", &Vchar_width_table,
1118 doc: /*
1119 A char-table for width (columns) of each character. */);
1120 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
1121 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1122 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1123 make_number (4));
1124
1125 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
1126 doc: /* A char-table for direction of each character. */);
1127 Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
1128
1129 DEFVAR_LISP ("printable-chars", &Vprintable_chars,
1130 doc: /* A char-table for each printable character. */);
1131 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
1132 Fset_char_table_range (Vprintable_chars,
1133 Fcons (make_number (32), make_number (126)), Qt);
1134 Fset_char_table_range (Vprintable_chars,
1135 Fcons (make_number (160),
1136 make_number (MAX_5_BYTE_CHAR)), Qt);
1137
1138 DEFVAR_LISP ("char-script-table", &Vchar_script_table,
1139 doc: /* Char table of script symbols.
1140 It has one extra slot whose value is a list of script symbols. */);
1141
1142 /* Intern this now in case it isn't already done.
1143 Setting this variable twice is harmless.
1144 But don't staticpro it here--that is done in alloc.c. */
1145 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
1146 DEFSYM (Qchar_script_table, "char-script-table");
1147 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1148 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
1149
1150 DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
1151 doc: /* Alist of scripts vs the representative characters.
1152 Each element is a cons (SCRIPT . CHARS).
1153 SCRIPT is a symbol representing a script or a subgroup of a script.
1154 CHARS is a list or a vector of characters.
1155 If it is a list, all characters in the list are necessary for supporting SCRIPT.
1156 If it is a vector, one of the characters in the vector is necessary.
1157 This variable is used to find a font for a specific script. */);
1158 Vscript_representative_chars = Qnil;
1159
1160 DEFVAR_LISP ("unicode-category-table", &Vunicode_category_table,
1161 doc: /* Char table of Unicode's "General Category".
1162 All Unicode characters have one of the following values (symbol):
1163 Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
1164 Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1165 See The Unicode Standard for the meaning of those values. */);
1166 /* The correct char-table is setup in characters.el. */
1167 Vunicode_category_table = Qnil;
1168 }
1169
1170 #endif /* emacs */
1171
1172 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1173 (do not change this comment) */