src/character.c

   1 /* Basic character support.
   2    Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
   5      Free Software Foundation, Inc.
   6    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008
   7      National Institute of Advanced Industrial Science and Technology (AIST)
   8      Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software; you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation; either version 3, or (at your option)
  15 any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs; see the file COPYING.  If not, write to the
  24 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  25 Boston, MA 02110-1301, USA.  */
  26
/* First, read the documentation in `character.h' to understand the
   code in this file.  */
  29
  30 #ifdef emacs
  31 #include <config.h>
  32 #endif
  33
  34 #include <stdio.h>
  35
  36 #ifdef emacs
  37
  38 #include <sys/types.h>
  39 #include "lisp.h"
  40 #include "character.h"
  41 #include "buffer.h"
  42 #include "charset.h"
  43 #include "composite.h"
  44 #include "disptab.h"
  45
  46 #else  /* not emacs */
  47
  48 #include "mulelib.h"
  49
  50 #endif /* emacs */
  51
/* Symbol `characterp' (set up by DEFSYM in syms_of_character).  */
Lisp_Object Qcharacterp;

/* Vector of all translation tables ever defined.
   The ID of a translation table is used to index this vector.  */
Lisp_Object Vtranslation_table_vector;

/* A char-table for characters which may invoke auto-filling.  */
Lisp_Object Vauto_fill_chars;

/* Symbol `auto-fill-chars' (set up by DEFSYM in syms_of_character).  */
Lisp_Object Qauto_fill_chars;

/* Char-table of information about which character to unify to which
   Unicode character.  */
Lisp_Object Vchar_unify_table;

/* A char-table.  An element is non-nil iff the corresponding
   character has a printable glyph.  */
Lisp_Object Vprintable_chars;

/* A char-table.  An element is the column width of the corresponding
   character.  */
Lisp_Object Vchar_width_table;

/* A char-table.  An element is a symbol indicating the direction
   property of the corresponding character.
   NOTE(review): syms_of_character creates this table with the integer
   1 as the default value, not a symbol -- confirm which
   representation is intended.  */
Lisp_Object Vchar_direction_table;

/* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
unsigned char *_fetch_multibyte_char_p;

/* Char table of scripts.  */
Lisp_Object Vchar_script_table;

/* Alist of scripts vs representative characters.  */
Lisp_Object Vscript_representative_chars;

/* Symbol `char-script-table' (set up by DEFSYM in syms_of_character).  */
static Lisp_Object Qchar_script_table;

/* Mapping table from unibyte chars to multibyte chars.  */
int unibyte_to_multibyte_table[256];

/* Nth element is 1 iff unibyte char N can be mapped to a multibyte
   char.  */
char unibyte_has_multibyte_table[256];
  96
  97 \f
  98
  99 /* Store multibyte form of character C at P.  If C has modifier bits,
 100    handle them appropriately.  */
 101
 102 int
 103 char_string (c, p)
 104      unsigned c;
 105      unsigned char *p;
 106 {
 107   int bytes;
 108
 109   if (c & CHAR_MODIFIER_MASK)
 110     {
 111       /* As an non-ASCII character can't have modifier bits, we just
 112          ignore the bits.  */
 113       if (ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 114         {
 115           /* For Meta, Shift, and Control modifiers, we need special care.  */
 116           if (c & CHAR_META)
 117             {
 118               /* Move the meta bit to the right place for a string.  */
 119               c = (c & ~CHAR_META) | 0x80;
 120             }
 121           if (c & CHAR_SHIFT)
 122             {
 123               /* Shift modifier is valid only with [A-Za-z].  */
 124               if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 125                 c &= ~CHAR_SHIFT;
 126               else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 127                 c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 128             }
 129           if (c & CHAR_CTL)
 130             {
 131               /* Simulate the code in lread.c.  */
 132               /* Allow `\C- ' and `\C-?'.  */
 133               if (c == (CHAR_CTL | ' '))
 134                 c = 0;
 135               else if (c == (CHAR_CTL | '?'))
 136                 c = 127;
 137               /* ASCII control chars are made from letters (both cases),
 138                  as well as the non-letters within 0100...0137.  */
 139               else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 140                 c &= (037 | (~0177 & ~CHAR_CTL));
 141               else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 142                 c &= (037 | (~0177 & ~CHAR_CTL));
 143             }
 144         }
 145
 146       /* If C still has any modifier bits, just ignore it.  */
 147       c &= ~CHAR_MODIFIER_MASK;
 148     }
 149
 150   MAYBE_UNIFY_CHAR (c);
 151
 152   if (c <= MAX_3_BYTE_CHAR)
 153     {
 154       bytes = CHAR_STRING (c, p);
 155     }
 156   else if (c <= MAX_4_BYTE_CHAR)
 157     {
 158       p[0] = (0xF0 | (c >> 18));
 159       p[1] = (0x80 | ((c >> 12) & 0x3F));
 160       p[2] = (0x80 | ((c >> 6) & 0x3F));
 161       p[3] = (0x80 | (c & 0x3F));
 162       bytes = 4;
 163     }
 164   else if (c <= MAX_5_BYTE_CHAR)
 165     {
 166       p[0] = 0xF8;
 167       p[1] = (0x80 | ((c >> 18) & 0x0F));
 168       p[2] = (0x80 | ((c >> 12) & 0x3F));
 169       p[3] = (0x80 | ((c >> 6) & 0x3F));
 170       p[4] = (0x80 | (c & 0x3F));
 171       bytes = 5;
 172     }
 173   else if (c <= MAX_CHAR)
 174     {
 175       c = CHAR_TO_BYTE8 (c);
 176       bytes = BYTE8_STRING (c, p);
 177     }
 178   else
 179     error ("Invalid character: %d", c);
 180
 181   return bytes;
 182 }
 183
 184
 185 /* Return a character whose multibyte form is at P.  Set LEN is not
 186    NULL, it must be a pointer to integer.  In that case, set *LEN to
 187    the byte length of the multibyte form.  If ADVANCED is not NULL, is
 188    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 189    the ending address (i.e. the starting address of the next
 190    character) of the multibyte form.  */
 191
 192 int
 193 string_char (p, advanced, len)
 194      const unsigned char *p;
 195      const unsigned char **advanced;
 196      int *len;
 197 {
 198   int c;
 199   const unsigned char *saved_p = p;
 200
 201   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 202     {
 203       c = STRING_CHAR_ADVANCE (p);
 204     }
 205   else if (! (*p & 0x08))
 206     {
 207       c = ((((p)[0] & 0xF) << 18)
 208            | (((p)[1] & 0x3F) << 12)
 209            | (((p)[2] & 0x3F) << 6)
 210            | ((p)[3] & 0x3F));
 211       p += 4;
 212     }
 213   else
 214     {
 215       c = ((((p)[1] & 0x3F) << 18)
 216            | (((p)[2] & 0x3F) << 12)
 217            | (((p)[3] & 0x3F) << 6)
 218            | ((p)[4] & 0x3F));
 219       p += 5;
 220     }
 221
 222   MAYBE_UNIFY_CHAR (c);
 223
 224   if (len)
 225     *len = p - saved_p;
 226   if (advanced)
 227     *advanced = p;
 228   return c;
 229 }
 230
 231
 232 /* Translate character C by translation table TABLE.  If C is
 233    negative, translate a character specified by CHARSET and CODE.  If
 234    no translation is found in TABLE, return the untranslated
 235    character.  If TABLE is a list, elements are char tables.  In this
 236    case, translace C by all tables.  */
 237
 238 int
 239 translate_char (table, c)
 240      Lisp_Object table;
 241      int c;
 242 {
 243   if (CHAR_TABLE_P (table))
 244     {
 245       Lisp_Object ch;
 246
 247       ch = CHAR_TABLE_REF (table, c);
 248       if (CHARACTERP (ch))
 249         c = XINT (ch);
 250     }
 251   else
 252     {
 253       for (; CONSP (table); table = XCDR (table))
 254         c = translate_char (XCAR (table), c);
 255     }
 256   return c;
 257 }
 258
 259 /* Convert the multibyte character C to unibyte 8-bit character based
 260    on the current value of charset_unibyte.  If dimension of
 261    charset_unibyte is more than one, return (C & 0xFF).
 262
 263    The argument REV_TBL is now ignored.  It will be removed in the
 264    future.  */
 265
 266 int
 267 multibyte_char_to_unibyte (c, rev_tbl)
 268      int c;
 269      Lisp_Object rev_tbl;
 270 {
 271   struct charset *charset;
 272   unsigned c1;
 273
 274   if (CHAR_BYTE8_P (c))
 275     return CHAR_TO_BYTE8 (c);
 276   charset = CHARSET_FROM_ID (charset_unibyte);
 277   c1 = ENCODE_CHAR (charset, c);
 278   return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
 279 }
 280
 281 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 282    by charset_unibyte.  */
 283
 284 int
 285 multibyte_char_to_unibyte_safe (c)
 286      int c;
 287 {
 288   struct charset *charset;
 289   unsigned c1;
 290
 291   if (CHAR_BYTE8_P (c))
 292     return CHAR_TO_BYTE8 (c);
 293   charset = CHARSET_FROM_ID (charset_unibyte);
 294   c1 = ENCODE_CHAR (charset, c);
 295   return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : -1);
 296 }
 297
 298 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 299        doc: /* Return non-nil if OBJECT is a character.  */)
 300      (object, ignore)
 301      Lisp_Object object, ignore;
 302 {
 303   return (CHARACTERP (object) ? Qt : Qnil);
 304 }
 305
 306 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 307        doc: /* Return the character of the maximum code.  */)
 308      ()
 309 {
 310   return make_number (MAX_CHAR);
 311 }
 312
 313 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 314        Sunibyte_char_to_multibyte, 1, 1, 0,
 315        doc: /* Convert the unibyte character CH to multibyte character.
 316 The multibyte character is a result of decoding CH by
 317 the current unibyte charset (see `unibyte-charset').  */)
 318      (ch)
 319      Lisp_Object ch;
 320 {
 321   int c;
 322   struct charset *charset;
 323
 324   CHECK_CHARACTER (ch);
 325   c = XFASTINT (ch);
 326   if (c >= 0400)
 327     error ("Invalid unibyte character: %d", c);
 328   charset = CHARSET_FROM_ID (charset_unibyte);
 329   c = DECODE_CHAR (charset, c);
 330   if (c < 0)
 331     c = BYTE8_TO_CHAR (XFASTINT (ch));
 332   return make_number (c);
 333 }
 334
 335 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 336        Smultibyte_char_to_unibyte, 1, 1, 0,
 337        doc: /* Convert the multibyte character CH to unibyte character.\n\
 338 The unibyte character is a result of encoding CH by
 339 the current primary charset (value of `charset-primary').  */)
 340      (ch)
 341      Lisp_Object ch;
 342 {
 343   int c;
 344
 345   CHECK_CHARACTER (ch);
 346   c = XFASTINT (ch);
 347   c = CHAR_TO_BYTE8 (c);
 348   return make_number (c);
 349 }
 350
 351 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
 352        doc: /* Return 1 regardless of the argument CHAR.
 353 This is now an obsolete function.  We keep it just for backward compatibility.   */)
 354      (ch)
 355      Lisp_Object ch;
 356 {
 357   CHECK_CHARACTER (ch);
 358   return make_number (1);
 359 }
 360
 361 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 362        doc: /* Return width of CHAR when displayed in the current buffer.
 363 The width is measured by how many columns it occupies on the screen.
 364 Tab is taken to occupy `tab-width' columns.  */)
 365      (ch)
 366        Lisp_Object ch;
 367 {
 368   Lisp_Object disp;
 369   int c, width;
 370   struct Lisp_Char_Table *dp = buffer_display_table ();
 371
 372   CHECK_CHARACTER (ch);
 373   c = XINT (ch);
 374
 375   /* Get the way the display table would display it.  */
 376   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
 377
 378   if (VECTORP (disp))
 379     width = ASIZE (disp);
 380   else
 381     width = CHAR_WIDTH (c);
 382
 383   return make_number (width);
 384 }
 385
 386 /* Return width of string STR of length LEN when displayed in the
 387    current buffer.  The width is measured by how many columns it
 388    occupies on the screen.  If PRECISION > 0, return the width of
 389    longest substring that doesn't exceed PRECISION, and set number of
 390    characters and bytes of the substring in *NCHARS and *NBYTES
 391    respectively.  */
 392
 393 int
 394 c_string_width (str, len, precision, nchars, nbytes)
 395      const unsigned char *str;
 396      int precision, *nchars, *nbytes;
 397 {
 398   int i = 0, i_byte = 0;
 399   int width = 0;
 400   struct Lisp_Char_Table *dp = buffer_display_table ();
 401
 402   while (i_byte < len)
 403     {
 404       int bytes, thiswidth;
 405       Lisp_Object val;
 406       int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 407
 408       if (dp)
 409         {
 410           val = DISP_CHAR_VECTOR (dp, c);
 411           if (VECTORP (val))
 412             thiswidth = XVECTOR (val)->size;
 413           else
 414             thiswidth = CHAR_WIDTH (c);
 415         }
 416       else
 417         {
 418           thiswidth = CHAR_WIDTH (c);
 419         }
 420
 421       if (precision > 0
 422           && (width + thiswidth > precision))
 423         {
 424           *nchars = i;
 425           *nbytes = i_byte;
 426           return width;
 427         }
 428       i++;
 429       i_byte += bytes;
 430       width += thiswidth;
 431   }
 432
 433   if (precision > 0)
 434     {
 435       *nchars = i;
 436       *nbytes = i_byte;
 437     }
 438
 439   return width;
 440 }
 441
 442 /* Return width of string STR of length LEN when displayed in the
 443    current buffer.  The width is measured by how many columns it
 444    occupies on the screen.  */
 445
 446 int
 447 strwidth (str, len)
 448      unsigned char *str;
 449      int len;
 450 {
 451   return c_string_width (str, len, -1, NULL, NULL);
 452 }
 453
 454 /* Return width of Lisp string STRING when displayed in the current
 455    buffer.  The width is measured by how many columns it occupies on
 456    the screen while paying attention to compositions.  If PRECISION >
 457    0, return the width of longest substring that doesn't exceed
 458    PRECISION, and set number of characters and bytes of the substring
 459    in *NCHARS and *NBYTES respectively.  */
 460
 461 int
 462 lisp_string_width (string, precision, nchars, nbytes)
 463      Lisp_Object string;
 464      int precision, *nchars, *nbytes;
 465 {
 466   int len = SCHARS (string);
 467   /* This set multibyte to 0 even if STRING is multibyte when it
 468      contains only ascii and eight-bit-graphic, but that's
 469      intentional.  */
 470   int multibyte = len < SBYTES (string);
 471   unsigned char *str = SDATA (string);
 472   int i = 0, i_byte = 0;
 473   int width = 0;
 474   struct Lisp_Char_Table *dp = buffer_display_table ();
 475
 476   while (i < len)
 477     {
 478       int chars, bytes, thiswidth;
 479       Lisp_Object val;
 480       int cmp_id;
 481       EMACS_INT ignore, end;
 482
 483       if (find_composition (i, -1, &ignore, &end, &val, string)
 484           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 485               >= 0))
 486         {
 487           thiswidth = composition_table[cmp_id]->width;
 488           chars = end - i;
 489           bytes = string_char_to_byte (string, end) - i_byte;
 490         }
 491       else
 492         {
 493           int c;
 494
 495           if (multibyte)
 496             c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 497           else
 498             c = str[i_byte], bytes = 1;
 499           chars = 1;
 500           if (dp)
 501             {
 502               val = DISP_CHAR_VECTOR (dp, c);
 503               if (VECTORP (val))
 504                 thiswidth = XVECTOR (val)->size;
 505               else
 506                 thiswidth = CHAR_WIDTH (c);
 507             }
 508           else
 509             {
 510               thiswidth = CHAR_WIDTH (c);
 511             }
 512         }
 513
 514       if (precision > 0
 515           && (width + thiswidth > precision))
 516         {
 517           *nchars = i;
 518           *nbytes = i_byte;
 519           return width;
 520         }
 521       i += chars;
 522       i_byte += bytes;
 523       width += thiswidth;
 524   }
 525
 526   if (precision > 0)
 527     {
 528       *nchars = i;
 529       *nbytes = i_byte;
 530     }
 531
 532   return width;
 533 }
 534
 535 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 536        doc: /* Return width of STRING when displayed in the current buffer.
 537 Width is measured by how many columns it occupies on the screen.
 538 When calculating width of a multibyte character in STRING,
 539 only the base leading-code is considered; the validity of
 540 the following bytes is not checked.  Tabs in STRING are always
 541 taken to occupy `tab-width' columns.  */)
 542      (str)
 543      Lisp_Object str;
 544 {
 545   Lisp_Object val;
 546
 547   CHECK_STRING (str);
 548   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 549   return val;
 550 }
 551
 552 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
 553        doc: /* Return the direction of CHAR.
 554 The returned value is 0 for left-to-right and 1 for right-to-left.  */)
 555      (ch)
 556      Lisp_Object ch;
 557 {
 558   int c;
 559
 560   CHECK_CHARACTER (ch);
 561   c = XINT (ch);
 562   return CHAR_TABLE_REF (Vchar_direction_table, c);
 563 }
 564
 565 /* Return the number of characters in the NBYTES bytes at PTR.
 566    This works by looking at the contents and checking for multibyte
 567    sequences while assuming that there's no invalid sequence.
 568    However, if the current buffer has enable-multibyte-characters =
 569    nil, we treat each byte as a character.  */
 570
 571 int
 572 chars_in_text (ptr, nbytes)
 573      const unsigned char *ptr;
 574      int nbytes;
 575 {
 576   /* current_buffer is null at early stages of Emacs initialization.  */
 577   if (current_buffer == 0
 578       || NILP (current_buffer->enable_multibyte_characters))
 579     return nbytes;
 580
 581   return multibyte_chars_in_text (ptr, nbytes);
 582 }
 583
 584 /* Return the number of characters in the NBYTES bytes at PTR.
 585    This works by looking at the contents and checking for multibyte
 586    sequences while assuming that there's no invalid sequence.  It
 587    ignores enable-multibyte-characters.  */
 588
 589 int
 590 multibyte_chars_in_text (ptr, nbytes)
 591      const unsigned char *ptr;
 592      int nbytes;
 593 {
 594   const unsigned char *endp = ptr + nbytes;
 595   int chars = 0;
 596
 597   while (ptr < endp)
 598     {
 599       int len = MULTIBYTE_LENGTH (ptr, endp);
 600
 601       if (len == 0)
 602         abort ();
 603       ptr += len;
 604       chars++;
 605     }
 606
 607   return chars;
 608 }
 609
 610 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 611    characters and bytes in it, and store them in *NCHARS and *NBYTES
 612    respectively.  On counting bytes, pay attention to that 8-bit
 613    characters not constructing a valid multibyte sequence are
 614    represented by 2-byte in a multibyte text.  */
 615
 616 void
 617 parse_str_as_multibyte (str, len, nchars, nbytes)
 618      const unsigned char *str;
 619      int len, *nchars, *nbytes;
 620 {
 621   const unsigned char *endp = str + len;
 622   int n, chars = 0, bytes = 0;
 623
 624   if (len >= MAX_MULTIBYTE_LENGTH)
 625     {
 626       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 627       while (str < adjusted_endp)
 628         {
 629           if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 630             str += n, bytes += n;
 631           else
 632             str++, bytes += 2;
 633           chars++;
 634         }
 635     }
 636   while (str < endp)
 637     {
 638       if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
 639         str += n, bytes += n;
 640       else
 641         str++, bytes += 2;
 642       chars++;
 643     }
 644
 645   *nchars = chars;
 646   *nbytes = bytes;
 647   return;
 648 }
 649
 650 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 651    It actually converts only such 8-bit characters that don't contruct
 652    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 653    NCHARS is nonzero, set *NCHARS to the number of characters in the
 654    text.  It is assured that we can use LEN bytes at STR as a work
 655    area and that is enough.  Return the number of bytes of the
 656    resulting text.  */
 657
 658 int
 659 str_as_multibyte (str, len, nbytes, nchars)
 660      unsigned char *str;
 661      int len, nbytes, *nchars;
 662 {
 663   unsigned char *p = str, *endp = str + nbytes;
 664   unsigned char *to;
 665   int chars = 0;
 666   int n;
 667
 668   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 669     {
 670       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 671       while (p < adjusted_endp
 672              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 673         p += n, chars++;
 674     }
 675   while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 676     p += n, chars++;
 677   if (nchars)
 678     *nchars = chars;
 679   if (p == endp)
 680     return nbytes;
 681
 682   to = p;
 683   nbytes = endp - p;
 684   endp = str + len;
 685   safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
 686   p = endp - nbytes;
 687
 688   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 689     {
 690       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 691       while (p < adjusted_endp)
 692         {
 693           if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 694             {
 695               while (n--)
 696                 *to++ = *p++;
 697             }
 698           else
 699             {
 700               int c = *p++;
 701               c = BYTE8_TO_CHAR (c);
 702               to += CHAR_STRING (c, to);
 703             }
 704         }
 705       chars++;
 706     }
 707   while (p < endp)
 708     {
 709       if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 710         {
 711           while (n--)
 712             *to++ = *p++;
 713         }
 714       else
 715         {
 716           int c = *p++;
 717           c = BYTE8_TO_CHAR (c);
 718           to += CHAR_STRING (c, to);
 719         }
 720       chars++;
 721     }
 722   if (nchars)
 723     *nchars = chars;
 724   return (to - str);
 725 }
 726
 727 /* Parse unibyte string at STR of LEN bytes, and return the number of
 728    bytes it may ocupy when converted to multibyte string by
 729    `str_to_multibyte'.  */
 730
 731 int
 732 parse_str_to_multibyte (str, len)
 733      unsigned char *str;
 734      int len;
 735 {
 736   unsigned char *endp = str + len;
 737   int bytes;
 738
 739   for (bytes = 0; str < endp; str++)
 740     bytes += (*str < 0x80) ? 1 : 2;
 741   return bytes;
 742 }
 743
 744
 745 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
 746    that contains the same single-byte characters.  It actually
 747    converts all 8-bit characters to multibyte forms.  It is assured
 748    that we can use LEN bytes at STR as a work area and that is
 749    enough.  */
 750
 751 int
 752 str_to_multibyte (str, len, bytes)
 753      unsigned char *str;
 754      int len, bytes;
 755 {
 756   unsigned char *p = str, *endp = str + bytes;
 757   unsigned char *to;
 758
 759   while (p < endp && *p < 0x80) p++;
 760   if (p == endp)
 761     return bytes;
 762   to = p;
 763   bytes = endp - p;
 764   endp = str + len;
 765   safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
 766   p = endp - bytes;
 767   while (p < endp)
 768     {
 769       int c = *p++;
 770
 771       if (c >= 0x80)
 772         c = BYTE8_TO_CHAR (c);
 773       to += CHAR_STRING (c, to);
 774     }
 775   return (to - str);
 776 }
 777
 778 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 779    actually converts characters in the range 0x80..0xFF to
 780    unibyte.  */
 781
 782 int
 783 str_as_unibyte (str, bytes)
 784      unsigned char *str;
 785      int bytes;
 786 {
 787   const unsigned char *p = str, *endp = str + bytes;
 788   unsigned char *to;
 789   int c, len;
 790
 791   while (p < endp)
 792     {
 793       c = *p;
 794       len = BYTES_BY_CHAR_HEAD (c);
 795       if (CHAR_BYTE8_HEAD_P (c))
 796         break;
 797       p += len;
 798     }
 799   to = str + (p - str);
 800   while (p < endp)
 801     {
 802       c = *p;
 803       len = BYTES_BY_CHAR_HEAD (c);
 804       if (CHAR_BYTE8_HEAD_P (c))
 805         {
 806           c = STRING_CHAR_ADVANCE (p);
 807           *to++ = CHAR_TO_BYTE8 (c);
 808         }
 809       else
 810         {
 811           while (len--) *to++ = *p++;
 812         }
 813     }
 814   return (to - str);
 815 }
 816
 817 int
 818 string_count_byte8 (string)
 819      Lisp_Object string;
 820 {
 821   int multibyte = STRING_MULTIBYTE (string);
 822   int nbytes = SBYTES (string);
 823   unsigned char *p = SDATA (string);
 824   unsigned char *pend = p + nbytes;
 825   int count = 0;
 826   int c, len;
 827
 828   if (multibyte)
 829     while (p < pend)
 830       {
 831         c = *p;
 832         len = BYTES_BY_CHAR_HEAD (c);
 833
 834         if (CHAR_BYTE8_HEAD_P (c))
 835           count++;
 836         p += len;
 837       }
 838   else
 839     while (p < pend)
 840       {
 841         if (*p++ >= 0x80)
 842           count++;
 843       }
 844   return count;
 845 }
 846
 847
 848 Lisp_Object
 849 string_escape_byte8 (string)
 850      Lisp_Object string;
 851 {
 852   int nchars = SCHARS (string);
 853   int nbytes = SBYTES (string);
 854   int multibyte = STRING_MULTIBYTE (string);
 855   int byte8_count;
 856   const unsigned char *src, *src_end;
 857   unsigned char *dst;
 858   Lisp_Object val;
 859   int c, len;
 860
 861   if (multibyte && nchars == nbytes)
 862     return string;
 863
 864   byte8_count = string_count_byte8 (string);
 865
 866   if (byte8_count == 0)
 867     return string;
 868
 869   if (multibyte)
 870     /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 871     val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 872                                         nbytes + byte8_count * 2);
 873   else
 874     /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 875     val = make_uninit_string (nbytes + byte8_count * 3);
 876
 877   src = SDATA (string);
 878   src_end = src + nbytes;
 879   dst = SDATA (val);
 880   if (multibyte)
 881     while (src < src_end)
 882       {
 883         c = *src;
 884         len = BYTES_BY_CHAR_HEAD (c);
 885
 886         if (CHAR_BYTE8_HEAD_P (c))
 887           {
 888             c = STRING_CHAR_ADVANCE (src);
 889             c = CHAR_TO_BYTE8 (c);
 890             sprintf ((char *) dst, "\\%03o", c);
 891             dst += 4;
 892           }
 893         else
 894           while (len--) *dst++ = *src++;
 895       }
 896   else
 897     while (src < src_end)
 898       {
 899         c = *src++;
 900         if (c >= 0x80)
 901           {
 902             sprintf ((char *) dst, "\\%03o", c);
 903             dst += 4;
 904           }
 905         else
 906           *dst++ = c;
 907       }
 908   return val;
 909 }
 910
 911 \f
 912 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 913        doc: /*
 914 Concatenate all the argument characters and make the result a string.
 915 usage: (string &rest CHARACTERS)  */)
 916      (n, args)
 917      int n;
 918      Lisp_Object *args;
 919 {
 920   int i;
 921   unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
 922   unsigned char *p = buf;
 923   int c;
 924
 925   for (i = 0; i < n; i++)
 926     {
 927       CHECK_CHARACTER (args[i]);
 928       c = XINT (args[i]);
 929       p += CHAR_STRING (c, p);
 930     }
 931
 932   return make_string_from_bytes ((char *) buf, n, p - buf);
 933 }
 934
 935 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 936        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 937 usage: (unibyte-string &rest BYTES)  */)
 938      (n, args)
 939      int n;
 940      Lisp_Object *args;
 941 {
 942   int i;
 943   unsigned char *buf = (unsigned char *) alloca (n);
 944   unsigned char *p = buf;
 945   unsigned c;
 946
 947   for (i = 0; i < n; i++)
 948     {
 949       CHECK_NATNUM (args[i]);
 950       c = XFASTINT (args[i]);
 951       if (c >= 256)
 952         args_out_of_range_3 (args[i], make_number (0), make_number (255));
 953       *p++ = c;
 954     }
 955
 956   return make_string_from_bytes ((char *) buf, n, p - buf);
 957 }
 958
/* Currently does nothing: the body is empty.  */
void
init_character_once ()
{
}
 963
 964 #ifdef emacs
 965
/* Set up the Lisp-visible symbols, primitives and variables defined
   in this file, and give the variables their initial values.  */
void
syms_of_character ()
{
  DEFSYM (Qcharacterp, "characterp");
  DEFSYM (Qauto_fill_chars, "auto-fill-chars");

  /* Vchar_unify_table is not declared with DEFVAR_LISP below, so it
     is staticpro'd by hand.  */
  staticpro (&Vchar_unify_table);
  Vchar_unify_table = Qnil;

  /* Register the primitives defined with DEFUN above.  */
  defsubr (&Smax_char);
  defsubr (&Scharacterp);
  defsubr (&Sunibyte_char_to_multibyte);
  defsubr (&Smultibyte_char_to_unibyte);
  defsubr (&Schar_bytes);
  defsubr (&Schar_width);
  defsubr (&Sstring_width);
  defsubr (&Schar_direction);
  defsubr (&Sstring);
  defsubr (&Sunibyte_string);

  DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
               doc: /*
Vector recording all translation tables ever defined.
Each element is a pair (SYMBOL . TABLE) relating the table to the
symbol naming it.  The ID of a translation table is an index into this vector.  */);
  /* Start with 16 slots, all nil.  */
  Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);

  DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
               doc: /*
A char-table for characters which invoke auto-filling.
Such characters have value t in this table.  */);
  /* Initially only space and newline are marked.  */
  Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
  CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
  CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);

  DEFVAR_LISP ("char-width-table", &Vchar_width_table,
               doc: /*
A char-table for width (columns) of each character.  */);
  /* The default width is 1; the ranges 0x80..0x9F and
     MAX_5_BYTE_CHAR + 1..MAX_CHAR get width 4.  */
  Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
  char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
  char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
                        make_number (4));

  DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
               doc: /* A char-table for direction of each character.  */);
  /* NOTE(review): the default value is 1, which the doc string of
     `char-direction' describes as right-to-left -- confirm that this
     default is intended.  */
  Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));

  DEFVAR_LISP ("printable-chars", &Vprintable_chars,
               doc: /* A char-table for each printable character.  */);
  /* Mark 32..126 and 160..MAX_5_BYTE_CHAR with t; everything else
     keeps the default nil.  */
  Vprintable_chars = Fmake_char_table (Qnil, Qnil);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (32), make_number (126)), Qt);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (160),
                                make_number (MAX_5_BYTE_CHAR)), Qt);

  DEFVAR_LISP ("char-script-table", &Vchar_script_table,
               doc: /* Char table of script symbols.
It has one extra slot whose value is a list of script symbols.  */);

  /* Intern this now in case it isn't already done.
     Setting this variable twice is harmless.
     But don't staticpro it here--that is done in alloc.c.  */
  Qchar_table_extra_slots = intern ("char-table-extra-slots");
  DEFSYM (Qchar_script_table, "char-script-table");
  /* Declare one extra slot for the `char-script-table' purpose before
     creating the table itself.  */
  Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
  Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);

  DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
               doc: /* Alist of scripts vs the representative characters.  */);
  Vscript_representative_chars = Qnil;
}
1038
1039 #endif /* emacs */
1040
1041 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1042    (do not change this comment) */