src/character.c

   1 /* Basic character support.
   2    Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2005, 2006 Free Software Foundation, Inc.
   5    Copyright (C) 2003, 2006
   6      National Institute of Advanced Industrial Science and Technology (AIST)
   7      Registration Number H13PRO009
   8
   9 This file is part of GNU Emacs.
  10
  11 GNU Emacs is free software; you can redistribute it and/or modify
  12 it under the terms of the GNU General Public License as published by
  13 the Free Software Foundation; either version 2, or (at your option)
  14 any later version.
  15
  16 GNU Emacs is distributed in the hope that it will be useful,
  17 but WITHOUT ANY WARRANTY; without even the implied warranty of
  18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  19 GNU General Public License for more details.
  20
  21 You should have received a copy of the GNU General Public License
  22 along with GNU Emacs; see the file COPYING.  If not, write to
  23 the Free Software Foundation, Inc., 59 Temple Place - Suite 330,
  24 Boston, MA 02111-1307, USA.  */
  25
  26 /* At first, see the document in `character.h' to understand the code
  27    in this file.  */
  28
  29 #ifdef emacs
  30 #include <config.h>
  31 #endif
  32
  33 #include <stdio.h>
  34
  35 #ifdef emacs
  36
  37 #include <sys/types.h>
  38 #include "lisp.h"
  39 #include "character.h"
  40 #include "buffer.h"
  41 #include "charset.h"
  42 #include "composite.h"
  43 #include "disptab.h"
  44
  45 #else  /* not emacs */
  46
  47 #include "mulelib.h"
  48
  49 #endif /* emacs */
  50
  51 Lisp_Object Qcharacterp;
  52
  53 /* Vector of translation table ever defined.
  54    ID of a translation table is used to index this vector.  */
  55 Lisp_Object Vtranslation_table_vector;
  56
  57 /* A char-table for characters which may invoke auto-filling.  */
  58 Lisp_Object Vauto_fill_chars;
  59
  60 Lisp_Object Qauto_fill_chars;
  61
  62 /* Char-table of information about which character to unify to which
  63    Unicode character.  */
  64 Lisp_Object Vchar_unify_table;
  65
  66 /* A char-table.  An element is non-nil iff the corresponding
  67    character has a printable glyph.  */
  68 Lisp_Object Vprintable_chars;
  69
  70 /* A char-table.  An elemnent is a column-width of the corresponding
  71    character.  */
  72 Lisp_Object Vchar_width_table;
  73
  74 /* A char-table.  An element is a symbol indicating the direction
  75    property of corresponding character.  */
  76 Lisp_Object Vchar_direction_table;
  77
  78 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
  79 unsigned char *_fetch_multibyte_char_p;
  80
  81 /* Char table of scripts.  */
  82 Lisp_Object Vchar_script_table;
  83
  84 /* Alist of scripts vs representative characters.  */
  85 Lisp_Object Vscript_representative_chars;
  86
  87 static Lisp_Object Qchar_script_table;
  88
  89 /* Mapping table from unibyte chars to multibyte chars.  */
  90 int unibyte_to_multibyte_table[256];
  91
  92 /* Nth element is 1 iff unibyte char N can be mapped to a multibyte
  93    char.  */
  94 char unibyte_has_multibyte_table[256];
  95
  96 \f
  97
  98 /* Store multibyte form of character C at P.  If C has modifier bits,
  99    handle them appropriately.  */
 100
 101 int
 102 char_string (c, p)
 103      unsigned c;
 104      unsigned char *p;
 105 {
 106   int bytes;
 107
 108   if (c & CHAR_MODIFIER_MASK)
 109     {
 110       /* As an non-ASCII character can't have modifier bits, we just
 111          ignore the bits.  */
 112       if (ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 113         {
 114           /* For Meta, Shift, and Control modifiers, we need special care.  */
 115           if (c & CHAR_META)
 116             {
 117               /* Move the meta bit to the right place for a string.  */
 118               c = (c & ~CHAR_META) | 0x80;
 119             }
 120           if (c & CHAR_SHIFT)
 121             {
 122               /* Shift modifier is valid only with [A-Za-z].  */
 123               if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 124                 c &= ~CHAR_SHIFT;
 125               else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 126                 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 127             }
 128           if (c & CHAR_CTL)
 129             {
 130               /* Simulate the code in lread.c.  */
 131               /* Allow `\C- ' and `\C-?'.  */
 132               if (c == (CHAR_CTL | ' '))
 133                 c = 0;
 134               else if (c == (CHAR_CTL | '?'))
 135                 c = 127;
 136               /* ASCII control chars are made from letters (both cases),
 137                  as well as the non-letters within 0100...0137.  */
 138               else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 139                 c &= (037 | (~0177 & ~CHAR_CTL));
 140               else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 141                 c &= (037 | (~0177 & ~CHAR_CTL));
 142             }
 143         }
 144
 145       /* If C still has any modifier bits, just ignore it.  */
 146       c &= ~CHAR_MODIFIER_MASK;
 147     }
 148
 149   MAYBE_UNIFY_CHAR (c);
 150
 151   if (c <= MAX_3_BYTE_CHAR)
 152     {
 153       bytes = CHAR_STRING (c, p);
 154     }
 155   else if (c <= MAX_4_BYTE_CHAR)
 156     {
 157       p[0] = (0xF0 | (c >> 18));
 158       p[1] = (0x80 | ((c >> 12) & 0x3F));
 159       p[2] = (0x80 | ((c >> 6) & 0x3F));
 160       p[3] = (0x80 | (c & 0x3F));
 161       bytes = 4;
 162     }
 163   else if (c <= MAX_5_BYTE_CHAR)
 164     {
 165       p[0] = 0xF8;
 166       p[1] = (0x80 | ((c >> 18) & 0x0F));
 167       p[2] = (0x80 | ((c >> 12) & 0x3F));
 168       p[3] = (0x80 | ((c >> 6) & 0x3F));
 169       p[4] = (0x80 | (c & 0x3F));
 170       bytes = 5;
 171     }
 172   else if (c <= MAX_CHAR)
 173     {
 174       c = CHAR_TO_BYTE8 (c);
 175       bytes = BYTE8_STRING (c, p);
 176     }
 177   else
 178     error ("Invalid character: %d", c);
 179
 180   return bytes;
 181 }
 182
 183
 184 /* Return a character whose multibyte form is at P.  Set LEN is not
 185    NULL, it must be a pointer to integer.  In that case, set *LEN to
 186    the byte length of the multibyte form.  If ADVANCED is not NULL, is
 187    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 188    the ending address (i.e. the starting address of the next
 189    character) of the multibyte form.  */
 190
 191 int
 192 string_char (p, advanced, len)
 193      const unsigned char *p;
 194      const unsigned char **advanced;
 195      int *len;
 196 {
 197   int c;
 198   const unsigned char *saved_p = p;
 199
 200   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 201     {
 202       c = STRING_CHAR_ADVANCE (p);
 203     }
 204   else if (! (*p & 0x08))
 205     {
 206       c = ((((p)[0] & 0xF) << 18)
 207            | (((p)[1] & 0x3F) << 12)
 208            | (((p)[2] & 0x3F) << 6)
 209            | ((p)[3] & 0x3F));
 210       p += 4;
 211     }
 212   else
 213     {
 214       c = ((((p)[1] & 0x3F) << 18)
 215            | (((p)[2] & 0x3F) << 12)
 216            | (((p)[3] & 0x3F) << 6)
 217            | ((p)[4] & 0x3F));
 218       p += 5;
 219     }
 220
 221   MAYBE_UNIFY_CHAR (c);
 222
 223   if (len)
 224     *len = p - saved_p;
 225   if (advanced)
 226     *advanced = p;
 227   return c;
 228 }
 229
 230
 231 /* Translate character C by translation table TABLE.  If C is
 232    negative, translate a character specified by CHARSET and CODE.  If
 233    no translation is found in TABLE, return the untranslated
 234    character.  If TABLE is a list, elements are char tables.  In this
 235    case, translace C by all tables.  */
 236
 237 int
 238 translate_char (table, c)
 239      Lisp_Object table;
 240      int c;
 241 {
 242   if (CHAR_TABLE_P (table))
 243     {
 244       Lisp_Object ch;
 245
 246       ch = CHAR_TABLE_REF (table, c);
 247       if (CHARACTERP (ch))
 248         c = XINT (ch);
 249     }
 250   else
 251     {
 252       for (; CONSP (table); table = XCDR (table))
 253         c = translate_char (XCAR (table), c);
 254     }
 255   return c;
 256 }
 257
 258 /* Convert the multibyte character C to unibyte 8-bit character based
 259    on the current value of charset_unibyte.  If dimension of
 260    charset_unibyte is more than one, return (C & 0xFF).
 261
 262    The argument REV_TBL is now ignored.  It will be removed in the
 263    future.  */
 264
 265 int
 266 multibyte_char_to_unibyte (c, rev_tbl)
 267      int c;
 268      Lisp_Object rev_tbl;
 269 {
 270   struct charset *charset;
 271   unsigned c1;
 272
 273   if (CHAR_BYTE8_P (c))
 274     return CHAR_TO_BYTE8 (c);
 275   charset = CHARSET_FROM_ID (charset_unibyte);
 276   c1 = ENCODE_CHAR (charset, c);
 277   return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
 278 }
 279
 280 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 281    by charset_unibyte.  */
 282
 283 int
 284 multibyte_char_to_unibyte_safe (c)
 285      int c;
 286 {
 287   struct charset *charset;
 288   unsigned c1;
 289
 290   if (CHAR_BYTE8_P (c))
 291     return CHAR_TO_BYTE8 (c);
 292   charset = CHARSET_FROM_ID (charset_unibyte);
 293   c1 = ENCODE_CHAR (charset, c);
 294   return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : -1);
 295 }
 296
 297 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 298        doc: /* Return non-nil if OBJECT is a character.  */)
 299      (object, ignore)
 300      Lisp_Object object, ignore;
 301 {
 302   return (CHARACTERP (object) ? Qt : Qnil);
 303 }
 304
 305 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 306        doc: /* Return the character of the maximum code.  */)
 307      ()
 308 {
 309   return make_number (MAX_CHAR);
 310 }
 311
 312 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 313        Sunibyte_char_to_multibyte, 1, 1, 0,
 314        doc: /* Convert the unibyte character CH to multibyte character.
 315 The multibyte character is a result of decoding CH by
 316 the current unibyte charset (see `unibyte-charset').  */)
 317      (ch)
 318      Lisp_Object ch;
 319 {
 320   int c;
 321   struct charset *charset;
 322
 323   CHECK_CHARACTER (ch);
 324   c = XFASTINT (ch);
 325   if (c >= 0400)
 326     error ("Invalid unibyte character: %d", c);
 327   charset = CHARSET_FROM_ID (charset_unibyte);
 328   c = DECODE_CHAR (charset, c);
 329   if (c < 0)
 330     c = BYTE8_TO_CHAR (XFASTINT (ch));
 331   return make_number (c);
 332 }
 333
 334 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 335        Smultibyte_char_to_unibyte, 1, 1, 0,
 336        doc: /* Convert the multibyte character CH to unibyte character.\n\
 337 The unibyte character is a result of encoding CH by
 338 the current primary charset (value of `charset-primary').  */)
 339      (ch)
 340      Lisp_Object ch;
 341 {
 342   int c;
 343
 344   CHECK_CHARACTER (ch);
 345   c = XFASTINT (ch);
 346   c = CHAR_TO_BYTE8 (c);
 347   return make_number (c);
 348 }
 349
 350 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
 351        doc: /* Return 1 regardless of the argument CHAR.
 352 This is now an obsolete function.  We keep it just for backward compatibility.   */)
 353      (ch)
 354      Lisp_Object ch;
 355 {
 356   CHECK_CHARACTER (ch);
 357   return make_number (1);
 358 }
 359
 360 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 361        doc: /* Return width of CHAR when displayed in the current buffer.
 362 The width is measured by how many columns it occupies on the screen.
 363 Tab is taken to occupy `tab-width' columns.  */)
 364      (ch)
 365        Lisp_Object ch;
 366 {
 367   Lisp_Object disp;
 368   int c, width;
 369   struct Lisp_Char_Table *dp = buffer_display_table ();
 370
 371   CHECK_CHARACTER (ch);
 372   c = XINT (ch);
 373
 374   /* Get the way the display table would display it.  */
 375   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
 376
 377   if (VECTORP (disp))
 378     width = ASIZE (disp);
 379   else
 380     width = CHAR_WIDTH (c);
 381
 382   return make_number (width);
 383 }
 384
 385 /* Return width of string STR of length LEN when displayed in the
 386    current buffer.  The width is measured by how many columns it
 387    occupies on the screen.  If PRECISION > 0, return the width of
 388    longest substring that doesn't exceed PRECISION, and set number of
 389    characters and bytes of the substring in *NCHARS and *NBYTES
 390    respectively.  */
 391
 392 int
 393 c_string_width (str, len, precision, nchars, nbytes)
 394      const unsigned char *str;
 395      int precision, *nchars, *nbytes;
 396 {
 397   int i = 0, i_byte = 0;
 398   int width = 0;
 399   struct Lisp_Char_Table *dp = buffer_display_table ();
 400
 401   while (i_byte < len)
 402     {
 403       int bytes, thiswidth;
 404       Lisp_Object val;
 405       int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 406
 407       if (dp)
 408         {
 409           val = DISP_CHAR_VECTOR (dp, c);
 410           if (VECTORP (val))
 411             thiswidth = XVECTOR (val)->size;
 412           else
 413             thiswidth = CHAR_WIDTH (c);
 414         }
 415       else
 416         {
 417           thiswidth = CHAR_WIDTH (c);
 418         }
 419
 420       if (precision > 0
 421           && (width + thiswidth > precision))
 422         {
 423           *nchars = i;
 424           *nbytes = i_byte;
 425           return width;
 426         }
 427       i++;
 428       i_byte += bytes;
 429       width += thiswidth;
 430   }
 431
 432   if (precision > 0)
 433     {
 434       *nchars = i;
 435       *nbytes = i_byte;
 436     }
 437
 438   return width;
 439 }
 440
 441 /* Return width of string STR of length LEN when displayed in the
 442    current buffer.  The width is measured by how many columns it
 443    occupies on the screen.  */
 444
 445 int
 446 strwidth (str, len)
 447      unsigned char *str;
 448      int len;
 449 {
 450   return c_string_width (str, len, -1, NULL, NULL);
 451 }
 452
 453 /* Return width of Lisp string STRING when displayed in the current
 454    buffer.  The width is measured by how many columns it occupies on
 455    the screen while paying attention to compositions.  If PRECISION >
 456    0, return the width of longest substring that doesn't exceed
 457    PRECISION, and set number of characters and bytes of the substring
 458    in *NCHARS and *NBYTES respectively.  */
 459
 460 int
 461 lisp_string_width (string, precision, nchars, nbytes)
 462      Lisp_Object string;
 463      int precision, *nchars, *nbytes;
 464 {
 465   int len = SCHARS (string);
 466   /* This set multibyte to 0 even if STRING is multibyte when it
 467      contains only ascii and eight-bit-graphic, but that's
 468      intentional.  */
 469   int multibyte = len < SBYTES (string);
 470   unsigned char *str = SDATA (string);
 471   int i = 0, i_byte = 0;
 472   int width = 0;
 473   struct Lisp_Char_Table *dp = buffer_display_table ();
 474
 475   while (i < len)
 476     {
 477       int chars, bytes, thiswidth;
 478       Lisp_Object val;
 479       int cmp_id;
 480       EMACS_INT ignore, end;
 481
 482       if (find_composition (i, -1, &ignore, &end, &val, string)
 483           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 484               >= 0))
 485         {
 486           thiswidth = composition_table[cmp_id]->width;
 487           chars = end - i;
 488           bytes = string_char_to_byte (string, end) - i_byte;
 489         }
 490       else
 491         {
 492           int c;
 493
 494           if (multibyte)
 495             c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 496           else
 497             c = str[i_byte], bytes = 1;
 498           chars = 1;
 499           if (dp)
 500             {
 501               val = DISP_CHAR_VECTOR (dp, c);
 502               if (VECTORP (val))
 503                 thiswidth = XVECTOR (val)->size;
 504               else
 505                 thiswidth = CHAR_WIDTH (c);
 506             }
 507           else
 508             {
 509               thiswidth = CHAR_WIDTH (c);
 510             }
 511         }
 512
 513       if (precision > 0
 514           && (width + thiswidth > precision))
 515         {
 516           *nchars = i;
 517           *nbytes = i_byte;
 518           return width;
 519         }
 520       i += chars;
 521       i_byte += bytes;
 522       width += thiswidth;
 523   }
 524
 525   if (precision > 0)
 526     {
 527       *nchars = i;
 528       *nbytes = i_byte;
 529     }
 530
 531   return width;
 532 }
 533
 534 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 535        doc: /* Return width of STRING when displayed in the current buffer.
 536 Width is measured by how many columns it occupies on the screen.
 537 When calculating width of a multibyte character in STRING,
 538 only the base leading-code is considered; the validity of
 539 the following bytes is not checked.  Tabs in STRING are always
 540 taken to occupy `tab-width' columns.  */)
 541      (str)
 542      Lisp_Object str;
 543 {
 544   Lisp_Object val;
 545
 546   CHECK_STRING (str);
 547   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 548   return val;
 549 }
 550
 551 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
 552        doc: /* Return the direction of CHAR.
 553 The returned value is 0 for left-to-right and 1 for right-to-left.  */)
 554      (ch)
 555      Lisp_Object ch;
 556 {
 557   int c;
 558
 559   CHECK_CHARACTER (ch);
 560   c = XINT (ch);
 561   return CHAR_TABLE_REF (Vchar_direction_table, c);
 562 }
 563
 564 /* Return the number of characters in the NBYTES bytes at PTR.
 565    This works by looking at the contents and checking for multibyte
 566    sequences while assuming that there's no invalid sequence.
 567    However, if the current buffer has enable-multibyte-characters =
 568    nil, we treat each byte as a character.  */
 569
 570 int
 571 chars_in_text (ptr, nbytes)
 572      const unsigned char *ptr;
 573      int nbytes;
 574 {
 575   /* current_buffer is null at early stages of Emacs initialization.  */
 576   if (current_buffer == 0
 577       || NILP (current_buffer->enable_multibyte_characters))
 578     return nbytes;
 579
 580   return multibyte_chars_in_text (ptr, nbytes);
 581 }
 582
 583 /* Return the number of characters in the NBYTES bytes at PTR.
 584    This works by looking at the contents and checking for multibyte
 585    sequences while assuming that there's no invalid sequence.  It
 586    ignores enable-multibyte-characters.  */
 587
 588 int
 589 multibyte_chars_in_text (ptr, nbytes)
 590      const unsigned char *ptr;
 591      int nbytes;
 592 {
 593   const unsigned char *endp = ptr + nbytes;
 594   int chars = 0;
 595
 596   while (ptr < endp)
 597     {
 598       int len = MULTIBYTE_LENGTH (ptr, endp);
 599
 600       if (len == 0)
 601         abort ();
 602       ptr += len;
 603       chars++;
 604     }
 605
 606   return chars;
 607 }
 608
 609 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 610    characters and bytes in it, and store them in *NCHARS and *NBYTES
 611    respectively.  On counting bytes, pay attention to that 8-bit
 612    characters not constructing a valid multibyte sequence are
 613    represented by 2-byte in a multibyte text.  */
 614
 615 void
 616 parse_str_as_multibyte (str, len, nchars, nbytes)
 617      const unsigned char *str;
 618      int len, *nchars, *nbytes;
 619 {
 620   const unsigned char *endp = str + len;
 621   int n, chars = 0, bytes = 0;
 622
 623   if (len >= MAX_MULTIBYTE_LENGTH)
 624     {
 625       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 626       while (str < adjusted_endp)
 627         {
 628           if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 629             str += n, bytes += n;
 630           else
 631             str++, bytes += 2;
 632           chars++;
 633         }
 634     }
 635   while (str < endp)
 636     {
 637       if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
 638         str += n, bytes += n;
 639       else
 640         str++, bytes += 2;
 641       chars++;
 642     }
 643
 644   *nchars = chars;
 645   *nbytes = bytes;
 646   return;
 647 }
 648
 649 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 650    It actually converts only such 8-bit characters that don't contruct
 651    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 652    NCHARS is nonzero, set *NCHARS to the number of characters in the
 653    text.  It is assured that we can use LEN bytes at STR as a work
 654    area and that is enough.  Return the number of bytes of the
 655    resulting text.  */
 656
 657 int
 658 str_as_multibyte (str, len, nbytes, nchars)
 659      unsigned char *str;
 660      int len, nbytes, *nchars;
 661 {
 662   unsigned char *p = str, *endp = str + nbytes;
 663   unsigned char *to;
 664   int chars = 0;
 665   int n;
 666
 667   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 668     {
 669       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 670       while (p < adjusted_endp
 671              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 672         p += n, chars++;
 673     }
 674   while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 675     p += n, chars++;
 676   if (nchars)
 677     *nchars = chars;
 678   if (p == endp)
 679     return nbytes;
 680
 681   to = p;
 682   nbytes = endp - p;
 683   endp = str + len;
 684   safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
 685   p = endp - nbytes;
 686
 687   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 688     {
 689       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 690       while (p < adjusted_endp)
 691         {
 692           if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 693             {
 694               while (n--)
 695                 *to++ = *p++;
 696             }
 697           else
 698             {
 699               int c = *p++;
 700               c = BYTE8_TO_CHAR (c);
 701               to += CHAR_STRING (c, to);
 702             }
 703         }
 704       chars++;
 705     }
 706   while (p < endp)
 707     {
 708       if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 709         {
 710           while (n--)
 711             *to++ = *p++;
 712         }
 713       else
 714         {
 715           int c = *p++;
 716           c = BYTE8_TO_CHAR (c);
 717           to += CHAR_STRING (c, to);
 718         }
 719       chars++;
 720     }
 721   if (nchars)
 722     *nchars = chars;
 723   return (to - str);
 724 }
 725
 726 /* Parse unibyte string at STR of LEN bytes, and return the number of
 727    bytes it may ocupy when converted to multibyte string by
 728    `str_to_multibyte'.  */
 729
 730 int
 731 parse_str_to_multibyte (str, len)
 732      unsigned char *str;
 733      int len;
 734 {
 735   unsigned char *endp = str + len;
 736   int bytes;
 737
 738   for (bytes = 0; str < endp; str++)
 739     bytes += (*str < 0x80) ? 1 : 2;
 740   return bytes;
 741 }
 742
 743
 744 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
 745    that contains the same single-byte characters.  It actually
 746    converts all 8-bit characters to multibyte forms.  It is assured
 747    that we can use LEN bytes at STR as a work area and that is
 748    enough.  */
 749
 750 int
 751 str_to_multibyte (str, len, bytes)
 752      unsigned char *str;
 753      int len, bytes;
 754 {
 755   unsigned char *p = str, *endp = str + bytes;
 756   unsigned char *to;
 757
 758   while (p < endp && *p < 0x80) p++;
 759   if (p == endp)
 760     return bytes;
 761   to = p;
 762   bytes = endp - p;
 763   endp = str + len;
 764   safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
 765   p = endp - bytes;
 766   while (p < endp)
 767     {
 768       int c = *p++;
 769
 770       if (c >= 0x80)
 771         c = BYTE8_TO_CHAR (c);
 772       to += CHAR_STRING (c, to);
 773     }
 774   return (to - str);
 775 }
 776
 777 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 778    actually converts characters in the range 0x80..0xFF to
 779    unibyte.  */
 780
 781 int
 782 str_as_unibyte (str, bytes)
 783      unsigned char *str;
 784      int bytes;
 785 {
 786   const unsigned char *p = str, *endp = str + bytes;
 787   unsigned char *to;
 788   int c, len;
 789
 790   while (p < endp)
 791     {
 792       c = *p;
 793       len = BYTES_BY_CHAR_HEAD (c);
 794       if (CHAR_BYTE8_HEAD_P (c))
 795         break;
 796       p += len;
 797     }
 798   to = str + (p - str);
 799   while (p < endp)
 800     {
 801       c = *p;
 802       len = BYTES_BY_CHAR_HEAD (c);
 803       if (CHAR_BYTE8_HEAD_P (c))
 804         {
 805           c = STRING_CHAR_ADVANCE (p);
 806           *to++ = CHAR_TO_BYTE8 (c);
 807         }
 808       else
 809         {
 810           while (len--) *to++ = *p++;
 811         }
 812     }
 813   return (to - str);
 814 }
 815
 816 int
 817 string_count_byte8 (string)
 818      Lisp_Object string;
 819 {
 820   int multibyte = STRING_MULTIBYTE (string);
 821   int nbytes = SBYTES (string);
 822   unsigned char *p = SDATA (string);
 823   unsigned char *pend = p + nbytes;
 824   int count = 0;
 825   int c, len;
 826
 827   if (multibyte)
 828     while (p < pend)
 829       {
 830         c = *p;
 831         len = BYTES_BY_CHAR_HEAD (c);
 832
 833         if (CHAR_BYTE8_HEAD_P (c))
 834           count++;
 835         p += len;
 836       }
 837   else
 838     while (p < pend)
 839       {
 840         if (*p++ >= 0x80)
 841           count++;
 842       }
 843   return count;
 844 }
 845
 846
 847 Lisp_Object
 848 string_escape_byte8 (string)
 849      Lisp_Object string;
 850 {
 851   int nchars = SCHARS (string);
 852   int nbytes = SBYTES (string);
 853   int multibyte = STRING_MULTIBYTE (string);
 854   int byte8_count;
 855   const unsigned char *src, *src_end;
 856   unsigned char *dst;
 857   Lisp_Object val;
 858   int c, len;
 859
 860   if (multibyte && nchars == nbytes)
 861     return string;
 862
 863   byte8_count = string_count_byte8 (string);
 864
 865   if (byte8_count == 0)
 866     return string;
 867
 868   if (multibyte)
 869     /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 870     val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 871                                         nbytes + byte8_count * 2);
 872   else
 873     /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 874     val = make_uninit_string (nbytes + byte8_count * 3);
 875
 876   src = SDATA (string);
 877   src_end = src + nbytes;
 878   dst = SDATA (val);
 879   if (multibyte)
 880     while (src < src_end)
 881       {
 882         c = *src;
 883         len = BYTES_BY_CHAR_HEAD (c);
 884
 885         if (CHAR_BYTE8_HEAD_P (c))
 886           {
 887             c = STRING_CHAR_ADVANCE (src);
 888             c = CHAR_TO_BYTE8 (c);
 889             sprintf ((char *) dst, "\\%03o", c);
 890             dst += 4;
 891           }
 892         else
 893           while (len--) *dst++ = *src++;
 894       }
 895   else
 896     while (src < src_end)
 897       {
 898         c = *src++;
 899         if (c >= 0x80)
 900           {
 901             sprintf ((char *) dst, "\\%03o", c);
 902             dst += 4;
 903           }
 904         else
 905           *dst++ = c;
 906       }
 907   return val;
 908 }
 909
 910 \f
 911 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 912        doc: /*
 913 Concatenate all the argument characters and make the result a string.
 914 usage: (string &rest CHARACTERS)  */)
 915      (n, args)
 916      int n;
 917      Lisp_Object *args;
 918 {
 919   int i;
 920   unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
 921   unsigned char *p = buf;
 922   int c;
 923
 924   for (i = 0; i < n; i++)
 925     {
 926       CHECK_CHARACTER (args[i]);
 927       c = XINT (args[i]);
 928       p += CHAR_STRING (c, p);
 929     }
 930
 931   return make_string_from_bytes ((char *) buf, n, p - buf);
 932 }
 933
 934 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 935        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 936 usage: (unibyte-string &rest BYTES)  */)
 937      (n, args)
 938      int n;
 939      Lisp_Object *args;
 940 {
 941   int i;
 942   unsigned char *buf = (unsigned char *) alloca (n);
 943   unsigned char *p = buf;
 944   unsigned c;
 945
 946   for (i = 0; i < n; i++)
 947     {
 948       CHECK_NATNUM (args[i]);
 949       c = XFASTINT (args[i]);
 950       if (c >= 256)
 951         args_out_of_range_3 (args[i], make_number (0), make_number (255));
 952       *p++ = c;
 953     }
 954
 955   return make_string_from_bytes ((char *) buf, n, p - buf);
 956 }
 957
 958 void
 959 init_character_once ()
 960 {
 961 }
 962
 963 #ifdef emacs
 964
 965 void
 966 syms_of_character ()
 967 {
 968   DEFSYM (Qcharacterp, "characterp");
 969   DEFSYM (Qauto_fill_chars, "auto-fill-chars");
 970
 971   staticpro (&Vchar_unify_table);
 972   Vchar_unify_table = Qnil;
 973
 974   defsubr (&Smax_char);
 975   defsubr (&Scharacterp);
 976   defsubr (&Sunibyte_char_to_multibyte);
 977   defsubr (&Smultibyte_char_to_unibyte);
 978   defsubr (&Schar_bytes);
 979   defsubr (&Schar_width);
 980   defsubr (&Sstring_width);
 981   defsubr (&Schar_direction);
 982   defsubr (&Sstring);
 983   defsubr (&Sunibyte_string);
 984
 985   DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
 986                doc: /*
 987 Vector recording all translation tables ever defined.
 988 Each element is a pair (SYMBOL . TABLE) relating the table to the
 989 symbol naming it.  The ID of a translation table is an index into this vector.  */);
 990   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
 991
 992   DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
 993                doc: /*
 994 A char-table for characters which invoke auto-filling.
 995 Such characters have value t in this table.  */);
 996   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
 997   CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
 998   CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
 999
1000   DEFVAR_LISP ("char-width-table", &Vchar_width_table,
1001                doc: /*
1002 A char-table for width (columns) of each character.  */);
1003   Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
1004   char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1005   char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1006                         make_number (4));
1007
1008   DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
1009                doc: /* A char-table for direction of each character.  */);
1010   Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
1011
1012   DEFVAR_LISP ("printable-chars", &Vprintable_chars,
1013                doc: /* A char-table for each printable character.  */);
1014   Vprintable_chars = Fmake_char_table (Qnil, Qnil);
1015   Fset_char_table_range (Vprintable_chars,
1016                          Fcons (make_number (32), make_number (126)), Qt);
1017   Fset_char_table_range (Vprintable_chars,
1018                          Fcons (make_number (160),
1019                                 make_number (MAX_5_BYTE_CHAR)), Qt);
1020
1021   DEFVAR_LISP ("char-script-table", &Vchar_script_table,
1022                doc: /* Char table of script symbols.
1023 It has one extra slot whose value is a list of script symbols.  */);
1024
1025   /* Intern this now in case it isn't already done.
1026      Setting this variable twice is harmless.
1027      But don't staticpro it here--that is done in alloc.c.  */
1028   Qchar_table_extra_slots = intern ("char-table-extra-slots");
1029   DEFSYM (Qchar_script_table, "char-script-table");
1030   Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1031   Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
1032
1033   DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
1034                doc: /* Alist of scripts vs the representative characters.  */);
1035   Vscript_representative_chars = Qnil;
1036 }
1037
1038 #endif /* emacs */
1039
1040 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1041    (do not change this comment) */