from trunk
[bpt/emacs.git] / src / character.c
CommitLineData
0168c3d8
KH
1/* Basic character support.
2 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
8f924df7 3 Licensed to the Free Software Foundation.
114f9c96 4 Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
ec62e0ac 5 Free Software Foundation, Inc.
114f9c96 6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
0168c3d8
KH
7 National Institute of Advanced Industrial Science and Technology (AIST)
8 Registration Number H13PRO009
9
10This file is part of GNU Emacs.
11
9ec0b715 12GNU Emacs is free software: you can redistribute it and/or modify
0168c3d8 13it under the terms of the GNU General Public License as published by
9ec0b715
GM
14the Free Software Foundation, either version 3 of the License, or
15(at your option) any later version.
0168c3d8
KH
16
17GNU Emacs is distributed in the hope that it will be useful,
18but WITHOUT ANY WARRANTY; without even the implied warranty of
19MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20GNU General Public License for more details.
21
22You should have received a copy of the GNU General Public License
9ec0b715 23along with GNU Emacs. If not, see <http://www.gnu.org/licenses/>. */
0168c3d8
KH
24
25/* At first, see the document in `character.h' to understand the code
26 in this file. */
27
28#ifdef emacs
29#include <config.h>
30#endif
31
32#include <stdio.h>
33
34#ifdef emacs
35
36#include <sys/types.h>
d7306fe6 37#include <setjmp.h>
0168c3d8
KH
38#include "lisp.h"
39#include "character.h"
40#include "buffer.h"
41#include "charset.h"
42#include "composite.h"
43#include "disptab.h"
44
45#else /* not emacs */
46
47#include "mulelib.h"
48
49#endif /* emacs */
50
51Lisp_Object Qcharacterp;
52
53/* Vector of translation table ever defined.
54 ID of a translation table is used to index this vector. */
55Lisp_Object Vtranslation_table_vector;
56
57/* A char-table for characters which may invoke auto-filling. */
58Lisp_Object Vauto_fill_chars;
59
60Lisp_Object Qauto_fill_chars;
61
33f91981 62/* Char-table of information about which character to unify to which
6f1b43a0 63 Unicode character. Mainly used by the macro MAYBE_UNIFY_CHAR. */
0168c3d8
KH
64Lisp_Object Vchar_unify_table;
65
66/* A char-table. An element is non-nil iff the corresponding
67 character has a printable glyph. */
68Lisp_Object Vprintable_chars;
69
70/* A char-table. An elemnent is a column-width of the corresponding
71 character. */
72Lisp_Object Vchar_width_table;
73
74/* A char-table. An element is a symbol indicating the direction
75 property of corresponding character. */
76Lisp_Object Vchar_direction_table;
77
8973478b 78/* Variable used locally in the macro FETCH_MULTIBYTE_CHAR. */
0168c3d8 79unsigned char *_fetch_multibyte_char_p;
0168c3d8 80
c57f3328
KH
81/* Char table of scripts. */
82Lisp_Object Vchar_script_table;
83
c7e14352
KH
84/* Alist of scripts vs representative characters. */
85Lisp_Object Vscript_representative_chars;
86
c57f3328
KH
87static Lisp_Object Qchar_script_table;
88
a3cbb631 89Lisp_Object Vunicode_category_table;
0168c3d8
KH
90\f
91
2bde7652
KH
92/* If character code C has modifier masks, reflect them to the
93 character code if possible. Return the resulting code. */
94
95int
96char_resolve_modifier_mask (c)
97 int c;
98{
d0363d44 99 /* A non-ASCII character can't reflect modifier bits to the code. */
2bde7652
KH
100 if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
101 return c;
102
103 /* For Meta, Shift, and Control modifiers, we need special care. */
2bde7652
KH
104 if (c & CHAR_SHIFT)
105 {
106 /* Shift modifier is valid only with [A-Za-z]. */
107 if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
108 c &= ~CHAR_SHIFT;
109 else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
110 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
03365d0e
KH
111 /* Shift modifier for control characters and SPC is ignored. */
112 else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
d0363d44
KH
113 c &= ~CHAR_SHIFT;
114 }
2bde7652
KH
115 if (c & CHAR_CTL)
116 {
117 /* Simulate the code in lread.c. */
118 /* Allow `\C- ' and `\C-?'. */
03365d0e
KH
119 if ((c & 0377) == ' ')
120 c &= ~0177 & ~ CHAR_CTL;
121 else if ((c & 0377) == '?')
122 c = 0177 | (c & ~0177 & ~CHAR_CTL);
2bde7652
KH
123 /* ASCII control chars are made from letters (both cases),
124 as well as the non-letters within 0100...0137. */
125 else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
126 c &= (037 | (~0177 & ~CHAR_CTL));
127 else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
128 c &= (037 | (~0177 & ~CHAR_CTL));
129 }
e9c1637d 130#if 0 /* This is outside the scope of this function. (bug#4751) */
03365d0e
KH
131 if (c & CHAR_META)
132 {
133 /* Move the meta bit to the right place for a string. */
134 c = (c & ~CHAR_META) | 0x80;
135 }
e9c1637d 136#endif
2bde7652
KH
137
138 return c;
139}
140
141
33f91981
KH
142/* Store multibyte form of character C at P. If C has modifier bits,
143 handle them appropriately. */
144
0168c3d8 145int
e3d8eb8c 146char_string (c, p)
5aa91c9b 147 unsigned c;
1889b238 148 unsigned char *p;
0168c3d8
KH
149{
150 int bytes;
151
e3d8eb8c
KH
152 if (c & CHAR_MODIFIER_MASK)
153 {
2bde7652 154 c = (unsigned) char_resolve_modifier_mask ((int) c);
e3d8eb8c
KH
155 /* If C still has any modifier bits, just ignore it. */
156 c &= ~CHAR_MODIFIER_MASK;
157 }
158
0168c3d8
KH
159 MAYBE_UNIFY_CHAR (c);
160
e3d8eb8c 161 if (c <= MAX_3_BYTE_CHAR)
0168c3d8
KH
162 {
163 bytes = CHAR_STRING (c, p);
164 }
165 else if (c <= MAX_4_BYTE_CHAR)
166 {
167 p[0] = (0xF0 | (c >> 18));
168 p[1] = (0x80 | ((c >> 12) & 0x3F));
169 p[2] = (0x80 | ((c >> 6) & 0x3F));
170 p[3] = (0x80 | (c & 0x3F));
171 bytes = 4;
172 }
e3d8eb8c 173 else if (c <= MAX_5_BYTE_CHAR)
0168c3d8
KH
174 {
175 p[0] = 0xF8;
176 p[1] = (0x80 | ((c >> 18) & 0x0F));
177 p[2] = (0x80 | ((c >> 12) & 0x3F));
178 p[3] = (0x80 | ((c >> 6) & 0x3F));
179 p[4] = (0x80 | (c & 0x3F));
180 bytes = 5;
181 }
5aa91c9b 182 else if (c <= MAX_CHAR)
e3d8eb8c
KH
183 {
184 c = CHAR_TO_BYTE8 (c);
185 bytes = BYTE8_STRING (c, p);
186 }
5aa91c9b
KH
187 else
188 error ("Invalid character: %d", c);
1889b238 189
0168c3d8
KH
190 return bytes;
191}
192
193
33f91981
KH
194/* Return a character whose multibyte form is at P. Set LEN is not
195 NULL, it must be a pointer to integer. In that case, set *LEN to
196 the byte length of the multibyte form. If ADVANCED is not NULL, is
197 must be a pointer to unsigned char. In that case, set *ADVANCED to
198 the ending address (i.e. the starting address of the next
199 character) of the multibyte form. */
200
0168c3d8 201int
e3d8eb8c 202string_char (p, advanced, len)
15843e6f
KH
203 const unsigned char *p;
204 const unsigned char **advanced;
0168c3d8
KH
205 int *len;
206{
1889b238 207 int c;
15843e6f 208 const unsigned char *saved_p = p;
0168c3d8
KH
209
210 if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
211 {
212 c = STRING_CHAR_ADVANCE (p);
213 }
214 else if (! (*p & 0x08))
215 {
216 c = ((((p)[0] & 0xF) << 18)
217 | (((p)[1] & 0x3F) << 12)
218 | (((p)[2] & 0x3F) << 6)
219 | ((p)[3] & 0x3F));
220 p += 4;
221 }
222 else
223 {
224 c = ((((p)[1] & 0x3F) << 18)
225 | (((p)[2] & 0x3F) << 12)
226 | (((p)[3] & 0x3F) << 6)
227 | ((p)[4] & 0x3F));
228 p += 5;
229 }
230
231 MAYBE_UNIFY_CHAR (c);
232
233 if (len)
234 *len = p - saved_p;
235 if (advanced)
236 *advanced = p;
237 return c;
238}
239
240
241/* Translate character C by translation table TABLE. If C is
242 negative, translate a character specified by CHARSET and CODE. If
243 no translation is found in TABLE, return the untranslated
10453be9
KH
244 character. If TABLE is a list, elements are char tables. In this
245 case, translace C by all tables. */
0168c3d8
KH
246
247int
248translate_char (table, c)
249 Lisp_Object table;
250 int c;
251{
10453be9
KH
252 if (CHAR_TABLE_P (table))
253 {
254 Lisp_Object ch;
255
256 ch = CHAR_TABLE_REF (table, c);
257 if (CHARACTERP (ch))
258 c = XINT (ch);
259 }
260 else
261 {
262 for (; CONSP (table); table = XCDR (table))
263 c = translate_char (XCAR (table), c);
264 }
265 return c;
0168c3d8
KH
266}
267
2e5db15c
KH
268/* Convert ASCII or 8-bit character C to unibyte. If C is none of
269 them, return (C & 0xFF).
0168c3d8
KH
270
271 The argument REV_TBL is now ignored. It will be removed in the
272 future. */
273
274int
275multibyte_char_to_unibyte (c, rev_tbl)
276 int c;
277 Lisp_Object rev_tbl;
278{
2e5db15c
KH
279 if (c < 0x80)
280 return c;
b672c5ae
KH
281 if (CHAR_BYTE8_P (c))
282 return CHAR_TO_BYTE8 (c);
2e5db15c 283 return (c & 0xFF);
0168c3d8
KH
284}
285
935d5b02
KH
286/* Like multibyte_char_to_unibyte, but return -1 if C is not supported
287 by charset_unibyte. */
288
289int
290multibyte_char_to_unibyte_safe (c)
291 int c;
292{
2e5db15c
KH
293 if (c < 0x80)
294 return c;
935d5b02
KH
295 if (CHAR_BYTE8_P (c))
296 return CHAR_TO_BYTE8 (c);
2e5db15c 297 return -1;
935d5b02 298}
0168c3d8
KH
299
300DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
301 doc: /* Return non-nil if OBJECT is a character. */)
302 (object, ignore)
303 Lisp_Object object, ignore;
304{
305 return (CHARACTERP (object) ? Qt : Qnil);
306}
307
308DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
309 doc: /* Return the character of the maximum code. */)
310 ()
311{
312 return make_number (MAX_CHAR);
313}
314
315DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
316 Sunibyte_char_to_multibyte, 1, 1, 0,
5556875b 317 doc: /* Convert the byte CH to multibyte character. */)
0168c3d8
KH
318 (ch)
319 Lisp_Object ch;
320{
321 int c;
0168c3d8
KH
322
323 CHECK_CHARACTER (ch);
324 c = XFASTINT (ch);
2e5db15c
KH
325 if (c >= 0x100)
326 error ("Not a unibyte character: %d", c);
4c0354d7 327 MAKE_CHAR_MULTIBYTE (c);
0168c3d8
KH
328 return make_number (c);
329}
330
331DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
332 Smultibyte_char_to_unibyte, 1, 1, 0,
5556875b
SM
333 doc: /* Convert the multibyte character CH to a byte.
334If the multibyte character does not represent a byte, return -1. */)
0168c3d8
KH
335 (ch)
336 Lisp_Object ch;
337{
5556875b 338 int cm;
0168c3d8
KH
339
340 CHECK_CHARACTER (ch);
5556875b
SM
341 cm = XFASTINT (ch);
342 if (cm < 256)
343 /* Can't distinguish a byte read from a unibyte buffer from
344 a latin1 char, so let's let it slide. */
345 return ch;
346 else
347 {
2afc21f5 348 int cu = CHAR_TO_BYTE_SAFE (cm);
5556875b
SM
349 return make_number (cu);
350 }
0168c3d8
KH
351}
352
353DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
354 doc: /* Return 1 regardless of the argument CHAR.
add553ac
JB
355This is now an obsolete function. We keep it just for backward compatibility.
356usage: (char-bytes CHAR) */)
0168c3d8
KH
357 (ch)
358 Lisp_Object ch;
359{
360 CHECK_CHARACTER (ch);
361 return make_number (1);
362}
363
364DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
365 doc: /* Return width of CHAR when displayed in the current buffer.
366The width is measured by how many columns it occupies on the screen.
add553ac
JB
367Tab is taken to occupy `tab-width' columns.
368usage: (char-width CHAR) */)
0168c3d8
KH
369 (ch)
370 Lisp_Object ch;
371{
372 Lisp_Object disp;
373 int c, width;
374 struct Lisp_Char_Table *dp = buffer_display_table ();
375
376 CHECK_CHARACTER (ch);
377 c = XINT (ch);
378
379 /* Get the way the display table would display it. */
380 disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
381
382 if (VECTORP (disp))
383 width = ASIZE (disp);
384 else
385 width = CHAR_WIDTH (c);
386
387 return make_number (width);
388}
389
0168c3d8
KH
390/* Return width of string STR of length LEN when displayed in the
391 current buffer. The width is measured by how many columns it
392 occupies on the screen. If PRECISION > 0, return the width of
393 longest substring that doesn't exceed PRECISION, and set number of
394 characters and bytes of the substring in *NCHARS and *NBYTES
395 respectively. */
396
1889b238 397int
63c125ab 398c_string_width (const unsigned char *str, int len, int precision, int *nchars, int *nbytes)
0168c3d8
KH
399{
400 int i = 0, i_byte = 0;
401 int width = 0;
402 struct Lisp_Char_Table *dp = buffer_display_table ();
403
404 while (i_byte < len)
405 {
406 int bytes, thiswidth;
407 Lisp_Object val;
62a6e103 408 int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
0168c3d8
KH
409
410 if (dp)
411 {
412 val = DISP_CHAR_VECTOR (dp, c);
413 if (VECTORP (val))
414 thiswidth = XVECTOR (val)->size;
415 else
416 thiswidth = CHAR_WIDTH (c);
417 }
418 else
419 {
420 thiswidth = CHAR_WIDTH (c);
421 }
422
423 if (precision > 0
424 && (width + thiswidth > precision))
425 {
426 *nchars = i;
427 *nbytes = i_byte;
428 return width;
429 }
430 i++;
431 i_byte += bytes;
432 width += thiswidth;
433 }
434
435 if (precision > 0)
436 {
437 *nchars = i;
438 *nbytes = i_byte;
439 }
440
441 return width;
442}
443
1889b238
KH
444/* Return width of string STR of length LEN when displayed in the
445 current buffer. The width is measured by how many columns it
446 occupies on the screen. */
447
448int
449strwidth (str, len)
450 unsigned char *str;
451 int len;
452{
453 return c_string_width (str, len, -1, NULL, NULL);
454}
455
0168c3d8
KH
456/* Return width of Lisp string STRING when displayed in the current
457 buffer. The width is measured by how many columns it occupies on
458 the screen while paying attention to compositions. If PRECISION >
459 0, return the width of longest substring that doesn't exceed
460 PRECISION, and set number of characters and bytes of the substring
461 in *NCHARS and *NBYTES respectively. */
462
463int
464lisp_string_width (string, precision, nchars, nbytes)
465 Lisp_Object string;
466 int precision, *nchars, *nbytes;
467{
8f924df7 468 int len = SCHARS (string);
0aee65b9
KH
469 /* This set multibyte to 0 even if STRING is multibyte when it
470 contains only ascii and eight-bit-graphic, but that's
471 intentional. */
472 int multibyte = len < SBYTES (string);
8f924df7 473 unsigned char *str = SDATA (string);
0168c3d8
KH
474 int i = 0, i_byte = 0;
475 int width = 0;
476 struct Lisp_Char_Table *dp = buffer_display_table ();
477
478 while (i < len)
479 {
480 int chars, bytes, thiswidth;
481 Lisp_Object val;
482 int cmp_id;
f4bc0685 483 EMACS_INT ignore, end;
0168c3d8
KH
484
485 if (find_composition (i, -1, &ignore, &end, &val, string)
486 && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
487 >= 0))
488 {
489 thiswidth = composition_table[cmp_id]->width;
490 chars = end - i;
491 bytes = string_char_to_byte (string, end) - i_byte;
492 }
0168c3d8
KH
493 else
494 {
0aee65b9 495 int c;
0168c3d8 496
0aee65b9 497 if (multibyte)
62a6e103 498 c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
0aee65b9
KH
499 else
500 c = str[i_byte], bytes = 1;
0168c3d8 501 chars = 1;
0aee65b9
KH
502 if (dp)
503 {
504 val = DISP_CHAR_VECTOR (dp, c);
505 if (VECTORP (val))
506 thiswidth = XVECTOR (val)->size;
507 else
508 thiswidth = CHAR_WIDTH (c);
509 }
510 else
511 {
512 thiswidth = CHAR_WIDTH (c);
513 }
0168c3d8
KH
514 }
515
516 if (precision > 0
517 && (width + thiswidth > precision))
518 {
519 *nchars = i;
520 *nbytes = i_byte;
521 return width;
522 }
523 i += chars;
524 i_byte += bytes;
525 width += thiswidth;
526 }
527
528 if (precision > 0)
529 {
530 *nchars = i;
531 *nbytes = i_byte;
532 }
533
534 return width;
535}
536
537DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
538 doc: /* Return width of STRING when displayed in the current buffer.
539Width is measured by how many columns it occupies on the screen.
540When calculating width of a multibyte character in STRING,
541only the base leading-code is considered; the validity of
542the following bytes is not checked. Tabs in STRING are always
add553ac
JB
543taken to occupy `tab-width' columns.
544usage: (string-width STRING) */)
0168c3d8
KH
545 (str)
546 Lisp_Object str;
547{
548 Lisp_Object val;
549
550 CHECK_STRING (str);
551 XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
552 return val;
553}
554
555DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
556 doc: /* Return the direction of CHAR.
add553ac
JB
557The returned value is 0 for left-to-right and 1 for right-to-left.
558usage: (char-direction CHAR) */)
0168c3d8
KH
559 (ch)
560 Lisp_Object ch;
561{
562 int c;
563
564 CHECK_CHARACTER (ch);
565 c = XINT (ch);
566 return CHAR_TABLE_REF (Vchar_direction_table, c);
567}
568
0168c3d8
KH
569/* Return the number of characters in the NBYTES bytes at PTR.
570 This works by looking at the contents and checking for multibyte
571 sequences while assuming that there's no invalid sequence.
572 However, if the current buffer has enable-multibyte-characters =
573 nil, we treat each byte as a character. */
574
13818c30 575EMACS_INT
0168c3d8 576chars_in_text (ptr, nbytes)
8f924df7 577 const unsigned char *ptr;
13818c30 578 EMACS_INT nbytes;
0168c3d8
KH
579{
580 /* current_buffer is null at early stages of Emacs initialization. */
581 if (current_buffer == 0
582 || NILP (current_buffer->enable_multibyte_characters))
583 return nbytes;
584
585 return multibyte_chars_in_text (ptr, nbytes);
586}
587
588/* Return the number of characters in the NBYTES bytes at PTR.
589 This works by looking at the contents and checking for multibyte
590 sequences while assuming that there's no invalid sequence. It
591 ignores enable-multibyte-characters. */
592
13818c30 593EMACS_INT
0168c3d8 594multibyte_chars_in_text (ptr, nbytes)
8f924df7 595 const unsigned char *ptr;
13818c30 596 EMACS_INT nbytes;
0168c3d8 597{
8f924df7 598 const unsigned char *endp = ptr + nbytes;
0168c3d8
KH
599 int chars = 0;
600
601 while (ptr < endp)
602 {
603 int len = MULTIBYTE_LENGTH (ptr, endp);
604
605 if (len == 0)
606 abort ();
607 ptr += len;
608 chars++;
609 }
610
611 return chars;
612}
613
614/* Parse unibyte text at STR of LEN bytes as a multibyte text, count
615 characters and bytes in it, and store them in *NCHARS and *NBYTES
616 respectively. On counting bytes, pay attention to that 8-bit
617 characters not constructing a valid multibyte sequence are
618 represented by 2-byte in a multibyte text. */
619
620void
621parse_str_as_multibyte (str, len, nchars, nbytes)
8f924df7 622 const unsigned char *str;
0168c3d8
KH
623 int len, *nchars, *nbytes;
624{
8f924df7 625 const unsigned char *endp = str + len;
0168c3d8
KH
626 int n, chars = 0, bytes = 0;
627
628 if (len >= MAX_MULTIBYTE_LENGTH)
629 {
8f924df7 630 const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
0168c3d8
KH
631 while (str < adjusted_endp)
632 {
633 if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
634 str += n, bytes += n;
635 else
636 str++, bytes += 2;
637 chars++;
638 }
639 }
640 while (str < endp)
641 {
642 if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
643 str += n, bytes += n;
644 else
645 str++, bytes += 2;
646 chars++;
647 }
648
649 *nchars = chars;
650 *nbytes = bytes;
651 return;
652}
653
654/* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
655 It actually converts only such 8-bit characters that don't contruct
656 a multibyte sequence to multibyte forms of Latin-1 characters. If
657 NCHARS is nonzero, set *NCHARS to the number of characters in the
658 text. It is assured that we can use LEN bytes at STR as a work
659 area and that is enough. Return the number of bytes of the
660 resulting text. */
661
662int
663str_as_multibyte (str, len, nbytes, nchars)
664 unsigned char *str;
665 int len, nbytes, *nchars;
666{
667 unsigned char *p = str, *endp = str + nbytes;
668 unsigned char *to;
669 int chars = 0;
670 int n;
671
672 if (nbytes >= MAX_MULTIBYTE_LENGTH)
673 {
674 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
675 while (p < adjusted_endp
676 && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
677 p += n, chars++;
678 }
679 while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
680 p += n, chars++;
681 if (nchars)
682 *nchars = chars;
683 if (p == endp)
684 return nbytes;
685
686 to = p;
687 nbytes = endp - p;
688 endp = str + len;
689 safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
690 p = endp - nbytes;
691
692 if (nbytes >= MAX_MULTIBYTE_LENGTH)
693 {
694 unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
695 while (p < adjusted_endp)
696 {
697 if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
698 {
699 while (n--)
700 *to++ = *p++;
701 }
702 else
703 {
704 int c = *p++;
705 c = BYTE8_TO_CHAR (c);
706 to += CHAR_STRING (c, to);
707 }
708 }
709 chars++;
710 }
711 while (p < endp)
712 {
713 if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
714 {
715 while (n--)
716 *to++ = *p++;
8f924df7 717 }
0168c3d8
KH
718 else
719 {
720 int c = *p++;
721 c = BYTE8_TO_CHAR (c);
722 to += CHAR_STRING (c, to);
723 }
724 chars++;
725 }
726 if (nchars)
727 *nchars = chars;
728 return (to - str);
729}
730
731/* Parse unibyte string at STR of LEN bytes, and return the number of
732 bytes it may ocupy when converted to multibyte string by
733 `str_to_multibyte'. */
734
735int
736parse_str_to_multibyte (str, len)
737 unsigned char *str;
738 int len;
739{
740 unsigned char *endp = str + len;
741 int bytes;
742
743 for (bytes = 0; str < endp; str++)
744 bytes += (*str < 0x80) ? 1 : 2;
745 return bytes;
746}
747
748
749/* Convert unibyte text at STR of NBYTES bytes to a multibyte text
750 that contains the same single-byte characters. It actually
751 converts all 8-bit characters to multibyte forms. It is assured
752 that we can use LEN bytes at STR as a work area and that is
753 enough. */
754
755int
756str_to_multibyte (str, len, bytes)
757 unsigned char *str;
758 int len, bytes;
759{
760 unsigned char *p = str, *endp = str + bytes;
761 unsigned char *to;
762
763 while (p < endp && *p < 0x80) p++;
764 if (p == endp)
765 return bytes;
766 to = p;
767 bytes = endp - p;
768 endp = str + len;
769 safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
770 p = endp - bytes;
8f924df7 771 while (p < endp)
0168c3d8
KH
772 {
773 int c = *p++;
774
775 if (c >= 0x80)
776 c = BYTE8_TO_CHAR (c);
777 to += CHAR_STRING (c, to);
778 }
779 return (to - str);
780}
781
782/* Arrange multibyte text at STR of LEN bytes as a unibyte text. It
783 actually converts characters in the range 0x80..0xFF to
784 unibyte. */
785
786int
787str_as_unibyte (str, bytes)
788 unsigned char *str;
789 int bytes;
790{
15843e6f
KH
791 const unsigned char *p = str, *endp = str + bytes;
792 unsigned char *to;
0168c3d8
KH
793 int c, len;
794
795 while (p < endp)
796 {
797 c = *p;
798 len = BYTES_BY_CHAR_HEAD (c);
799 if (CHAR_BYTE8_HEAD_P (c))
800 break;
801 p += len;
802 }
15843e6f 803 to = str + (p - str);
8f924df7 804 while (p < endp)
0168c3d8
KH
805 {
806 c = *p;
807 len = BYTES_BY_CHAR_HEAD (c);
808 if (CHAR_BYTE8_HEAD_P (c))
809 {
810 c = STRING_CHAR_ADVANCE (p);
811 *to++ = CHAR_TO_BYTE8 (c);
812 }
813 else
814 {
815 while (len--) *to++ = *p++;
816 }
817 }
818 return (to - str);
819}
820
4aa40bb8
KH
821/* Convert eight-bit chars in SRC (in multibyte form) to the
822 corresponding byte and store in DST. CHARS is the number of
823 characters in SRC. The value is the number of bytes stored in DST.
824 Usually, the value is the same as CHARS, but is less than it if SRC
825 contains a non-ASCII, non-eight-bit characater. If ACCEPT_LATIN_1
826 is nonzero, a Latin-1 character is accepted and converted to a byte
f27f70ec
KH
827 of that character code.
828 Note: Currently the arg ACCEPT_LATIN_1 is not used. */
4aa40bb8
KH
829
830EMACS_INT
831str_to_unibyte (src, dst, chars, accept_latin_1)
832 const unsigned char *src;
833 unsigned char *dst;
834 EMACS_INT chars;
835 int accept_latin_1;
836{
837 EMACS_INT i;
838
839 for (i = 0; i < chars; i++)
840 {
841 int c = STRING_CHAR_ADVANCE (src);
842
843 if (CHAR_BYTE8_P (c))
844 c = CHAR_TO_BYTE8 (c);
845 else if (! ASCII_CHAR_P (c)
846 && (! accept_latin_1 || c >= 0x100))
847 return i;
848 *dst++ = c;
849 }
850 return i;
851}
852
853
0168c3d8
KH
854int
855string_count_byte8 (string)
856 Lisp_Object string;
857{
858 int multibyte = STRING_MULTIBYTE (string);
8f924df7
KH
859 int nbytes = SBYTES (string);
860 unsigned char *p = SDATA (string);
0168c3d8
KH
861 unsigned char *pend = p + nbytes;
862 int count = 0;
863 int c, len;
864
865 if (multibyte)
866 while (p < pend)
867 {
868 c = *p;
869 len = BYTES_BY_CHAR_HEAD (c);
870
871 if (CHAR_BYTE8_HEAD_P (c))
872 count++;
873 p += len;
874 }
875 else
876 while (p < pend)
877 {
878 if (*p++ >= 0x80)
879 count++;
880 }
881 return count;
882}
883
884
885Lisp_Object
886string_escape_byte8 (string)
887 Lisp_Object string;
888{
8f924df7
KH
889 int nchars = SCHARS (string);
890 int nbytes = SBYTES (string);
0168c3d8
KH
891 int multibyte = STRING_MULTIBYTE (string);
892 int byte8_count;
15843e6f
KH
893 const unsigned char *src, *src_end;
894 unsigned char *dst;
0168c3d8
KH
895 Lisp_Object val;
896 int c, len;
897
898 if (multibyte && nchars == nbytes)
899 return string;
900
901 byte8_count = string_count_byte8 (string);
902
903 if (byte8_count == 0)
904 return string;
905
906 if (multibyte)
907 /* Convert 2-byte sequence of byte8 chars to 4-byte octal. */
7b40ebaf 908 val = make_uninit_multibyte_string (nchars + byte8_count * 3,
0168c3d8
KH
909 nbytes + byte8_count * 2);
910 else
911 /* Convert 1-byte sequence of byte8 chars to 4-byte octal. */
912 val = make_uninit_string (nbytes + byte8_count * 3);
913
8f924df7 914 src = SDATA (string);
0168c3d8 915 src_end = src + nbytes;
8f924df7 916 dst = SDATA (val);
0168c3d8
KH
917 if (multibyte)
918 while (src < src_end)
919 {
920 c = *src;
921 len = BYTES_BY_CHAR_HEAD (c);
922
923 if (CHAR_BYTE8_HEAD_P (c))
924 {
925 c = STRING_CHAR_ADVANCE (src);
926 c = CHAR_TO_BYTE8 (c);
1889b238 927 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
928 dst += 4;
929 }
930 else
931 while (len--) *dst++ = *src++;
932 }
933 else
934 while (src < src_end)
935 {
936 c = *src++;
937 if (c >= 0x80)
938 {
1889b238 939 sprintf ((char *) dst, "\\%03o", c);
0168c3d8
KH
940 dst += 4;
941 }
942 else
943 *dst++ = c;
944 }
945 return val;
946}
947
948\f
8f924df7 949DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
0168c3d8 950 doc: /*
d2e83296
DL
951Concatenate all the argument characters and make the result a string.
952usage: (string &rest CHARACTERS) */)
0168c3d8
KH
953 (n, args)
954 int n;
955 Lisp_Object *args;
956{
957 int i;
958 unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
959 unsigned char *p = buf;
960 int c;
961
962 for (i = 0; i < n; i++)
963 {
964 CHECK_CHARACTER (args[i]);
965 c = XINT (args[i]);
966 p += CHAR_STRING (c, p);
967 }
968
969 return make_string_from_bytes ((char *) buf, n, p - buf);
970}
971
70b4969d 972DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
87d6f965
KH
973 doc: /* Concatenate all the argument bytes and make the result a unibyte string.
974usage: (unibyte-string &rest BYTES) */)
70b4969d
KH
975 (n, args)
976 int n;
977 Lisp_Object *args;
978{
979 int i;
980 unsigned char *buf = (unsigned char *) alloca (n);
981 unsigned char *p = buf;
982 unsigned c;
983
984 for (i = 0; i < n; i++)
985 {
986 CHECK_NATNUM (args[i]);
987 c = XFASTINT (args[i]);
988 if (c >= 256)
989 args_out_of_range_3 (args[i], make_number (0), make_number (255));
990 *p++ = c;
991 }
992
993 return make_string_from_bytes ((char *) buf, n, p - buf);
994}
995
c73ae4ae 996DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
d0363d44
KH
997 Schar_resolve_modifiers, 1, 1, 0,
998 doc: /* Resolve modifiers in the character CHAR.
999The value is a character with modifiers resolved into the character
1000code. Unresolved modifiers are kept in the value.
c73ae4ae 1001usage: (char-resolve-modifiers CHAR) */)
d0363d44
KH
1002 (character)
1003 Lisp_Object character;
1004{
1005 int c;
1006
1007 CHECK_NUMBER (character);
1008 c = XINT (character);
1009 return make_number (char_resolve_modifier_mask (c));
1010}
1011
ee107a89
KH
1012DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
1013 doc: /* Return a byte value of a character at point.
1014Optional 1st arg POSITION, if non-nil, is a position of a character to get
1015a byte value.
1016Optional 2nd arg STRING, if non-nil, is a string of which first
1017character is a target to get a byte value. In this case, POSITION, if
1018non-nil, is an index of a target character in the string.
1019
1020If the current buffer (or STRING) is multibyte, and the target
1021character is not ASCII nor 8-bit character, an error is signalled. */)
1022 (position, string)
1023 Lisp_Object position, string;
1024{
1025 int c;
1026 EMACS_INT pos;
1027 unsigned char *p;
1028
1029 if (NILP (string))
1030 {
1031 if (NILP (position))
1032 {
1033 p = PT_ADDR;
4d8e170e 1034 }
ee107a89
KH
1035 else
1036 {
1037 CHECK_NUMBER_COERCE_MARKER (position);
1038 if (XINT (position) < BEGV || XINT (position) >= ZV)
1039 args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
1040 pos = XFASTINT (position);
1041 p = CHAR_POS_ADDR (pos);
1042 }
d5998e03
KH
1043 if (NILP (current_buffer->enable_multibyte_characters))
1044 return make_number (*p);
ee107a89
KH
1045 }
1046 else
1047 {
1048 CHECK_STRING (string);
1049 if (NILP (position))
1050 {
1051 p = SDATA (string);
1052 }
1053 else
1054 {
1055 CHECK_NATNUM (position);
1056 if (XINT (position) >= SCHARS (string))
1057 args_out_of_range (string, position);
1058 pos = XFASTINT (position);
1059 p = SDATA (string) + string_char_to_byte (string, pos);
1060 }
d5998e03
KH
1061 if (! STRING_MULTIBYTE (string))
1062 return make_number (*p);
ee107a89 1063 }
62a6e103 1064 c = STRING_CHAR (p);
ee107a89
KH
1065 if (CHAR_BYTE8_P (c))
1066 c = CHAR_TO_BYTE8 (c);
1067 else if (! ASCII_CHAR_P (c))
1068 error ("Not an ASCII nor an 8-bit character: %d", c);
1069 return make_number (c);
1070}
1071
1072
0168c3d8
KH
1073void
1074init_character_once ()
1075{
1076}
1077
1078#ifdef emacs
1079
1080void
1081syms_of_character ()
1082{
1083 DEFSYM (Qcharacterp, "characterp");
1084 DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1085
1086 staticpro (&Vchar_unify_table);
1087 Vchar_unify_table = Qnil;
1088
1089 defsubr (&Smax_char);
1090 defsubr (&Scharacterp);
1091 defsubr (&Sunibyte_char_to_multibyte);
1092 defsubr (&Smultibyte_char_to_unibyte);
1093 defsubr (&Schar_bytes);
1094 defsubr (&Schar_width);
1095 defsubr (&Sstring_width);
1096 defsubr (&Schar_direction);
0168c3d8 1097 defsubr (&Sstring);
70b4969d 1098 defsubr (&Sunibyte_string);
d0363d44 1099 defsubr (&Schar_resolve_modifiers);
ee107a89 1100 defsubr (&Sget_byte);
0168c3d8
KH
1101
1102 DEFVAR_LISP ("translation-table-vector", &Vtranslation_table_vector,
1103 doc: /*
68978cf0
DL
1104Vector recording all translation tables ever defined.
1105Each element is a pair (SYMBOL . TABLE) relating the table to the
1106symbol naming it. The ID of a translation table is an index into this vector. */);
0168c3d8
KH
1107 Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1108
1109 DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1110 doc: /*
1111A char-table for characters which invoke auto-filling.
1112Such characters have value t in this table. */);
1113 Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
6cc0e1ca
DL
1114 CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1115 CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
0168c3d8
KH
1116
1117 DEFVAR_LISP ("char-width-table", &Vchar_width_table,
1118 doc: /*
1119A char-table for width (columns) of each character. */);
1120 Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
be8b50bc
KH
1121 char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1122 char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1123 make_number (4));
0168c3d8
KH
1124
1125 DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
1126 doc: /* A char-table for direction of each character. */);
1127 Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
1128
1129 DEFVAR_LISP ("printable-chars", &Vprintable_chars,
1130 doc: /* A char-table for each printable character. */);
db6d4189 1131 Vprintable_chars = Fmake_char_table (Qnil, Qnil);
67dde660
KH
1132 Fset_char_table_range (Vprintable_chars,
1133 Fcons (make_number (32), make_number (126)), Qt);
1134 Fset_char_table_range (Vprintable_chars,
1135 Fcons (make_number (160),
1136 make_number (MAX_5_BYTE_CHAR)), Qt);
15843e6f 1137
c57f3328
KH
1138 DEFVAR_LISP ("char-script-table", &Vchar_script_table,
1139 doc: /* Char table of script symbols.
1140It has one extra slot whose value is a list of script symbols. */);
1141
1142 /* Intern this now in case it isn't already done.
1143 Setting this variable twice is harmless.
1144 But don't staticpro it here--that is done in alloc.c. */
d67b4f80 1145 Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
c57f3328
KH
1146 DEFSYM (Qchar_script_table, "char-script-table");
1147 Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1148 Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
c7e14352
KH
1149
1150 DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
c3bb7671 1151 doc: /* Alist of scripts vs the representative characters.
f4427a54 1152Each element is a cons (SCRIPT . CHARS).
4d8e170e 1153SCRIPT is a symbol representing a script or a subgroup of a script.
c3bb7671 1154CHARS is a list or a vector of characters.
472a4dc9 1155If it is a list, all characters in the list are necessary for supporting SCRIPT.
c3bb7671
KH
1156If it is a vector, one of the characters in the vector is necessary.
1157This variable is used to find a font for a specific script. */);
c7e14352 1158 Vscript_representative_chars = Qnil;
a3cbb631
KH
1159
1160 DEFVAR_LISP ("unicode-category-table", &Vunicode_category_table,
1161 doc: /* Char table of Unicode's "General Category".
472a4dc9
JB
1162All Unicode characters have one of the following values (symbol):
1163 Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
a3cbb631
KH
1164 Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1165See The Unicode Standard for the meaning of those values. */);
1166 /* The correct char-table is setup in characters.el. */
1167 Vunicode_category_table = Qnil;
0168c3d8
KH
1168}
1169
1170#endif /* emacs */
fbaf0946
MB
1171
1172/* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1173 (do not change this comment) */