src/character.c

   1 /* Basic character support.
   2    Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
   5      Free Software Foundation, Inc.
   6    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008
   7      National Institute of Advanced Industrial Science and Technology (AIST)
   8      Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software; you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation; either version 3, or (at your option)
  15 any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs; see the file COPYING.  If not, write to the
  24 Free Software Foundation, Inc., 51 Franklin Street, Fifth Floor,
  25 Boston, MA 02110-1301, USA.  */
  26
  27 /* At first, see the document in `character.h' to understand the code
  28    in this file.  */
  29
  30 #ifdef emacs
  31 #include <config.h>
  32 #endif
  33
  34 #include <stdio.h>
  35
  36 #ifdef emacs
  37
  38 #include <sys/types.h>
  39 #include "lisp.h"
  40 #include "character.h"
  41 #include "buffer.h"
  42 #include "charset.h"
  43 #include "composite.h"
  44 #include "disptab.h"
  45
  46 #else  /* not emacs */
  47
  48 #include "mulelib.h"
  49
  50 #endif /* emacs */
  51
  52 Lisp_Object Qcharacterp;
  53
  54 /* Vector of translation table ever defined.
  55    ID of a translation table is used to index this vector.  */
  56 Lisp_Object Vtranslation_table_vector;
  57
  58 /* A char-table for characters which may invoke auto-filling.  */
  59 Lisp_Object Vauto_fill_chars;
  60
  61 Lisp_Object Qauto_fill_chars;
  62
  63 /* Char-table of information about which character to unify to which
  64    Unicode character.  */
  65 Lisp_Object Vchar_unify_table;
  66
  67 /* A char-table.  An element is non-nil iff the corresponding
  68    character has a printable glyph.  */
  69 Lisp_Object Vprintable_chars;
  70
  71 /* A char-table.  An elemnent is a column-width of the corresponding
  72    character.  */
  73 Lisp_Object Vchar_width_table;
  74
  75 /* A char-table.  An element is a symbol indicating the direction
  76    property of corresponding character.  */
  77 Lisp_Object Vchar_direction_table;
  78
  79 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
  80 unsigned char *_fetch_multibyte_char_p;
  81
  82 /* Char table of scripts.  */
  83 Lisp_Object Vchar_script_table;
  84
  85 /* Alist of scripts vs representative characters.  */
  86 Lisp_Object Vscript_representative_chars;
  87
  88 static Lisp_Object Qchar_script_table;
  89
  90 /* Mapping table from unibyte chars to multibyte chars.  */
  91 int unibyte_to_multibyte_table[256];
  92
  93 /* Nth element is 1 iff unibyte char N can be mapped to a multibyte
  94    char.  */
  95 char unibyte_has_multibyte_table[256];
  96
  97 \f
  98
  99 /* If character code C has modifier masks, reflect them to the
 100    character code if possible.  Return the resulting code.  */
 101
 102 int
 103 char_resolve_modifier_mask (c)
 104      int c;
 105 {
 106   /* An non-ASCII character can't reflect modifier bits to the code.  */
 107   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 108     return c;
 109
 110   /* For Meta, Shift, and Control modifiers, we need special care.  */
 111   if (c & CHAR_META)
 112     {
 113       /* Move the meta bit to the right place for a string.  */
 114       c = (c & ~CHAR_META) | 0x80;
 115     }
 116   if (c & CHAR_SHIFT)
 117     {
 118       /* Shift modifier is valid only with [A-Za-z].  */
 119       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 120         c &= ~CHAR_SHIFT;
 121       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 122         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 123     }
 124   if (c & CHAR_CTL)
 125     {
 126       /* Simulate the code in lread.c.  */
 127       /* Allow `\C- ' and `\C-?'.  */
 128       if ((c & ~CHAR_CTL) == ' ')
 129         c = 0;
 130       else if ((c & ~CHAR_CTL) == '?')
 131         c = 127;
 132       /* ASCII control chars are made from letters (both cases),
 133          as well as the non-letters within 0100...0137.  */
 134       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 135         c &= (037 | (~0177 & ~CHAR_CTL));
 136       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 137         c &= (037 | (~0177 & ~CHAR_CTL));
 138     }
 139
 140   return c;
 141 }
 142
 143
 144 /* Store multibyte form of character C at P.  If C has modifier bits,
 145    handle them appropriately.  */
 146
 147 int
 148 char_string (c, p)
 149      unsigned c;
 150      unsigned char *p;
 151 {
 152   int bytes;
 153
 154   if (c & CHAR_MODIFIER_MASK)
 155     {
 156       c = (unsigned) char_resolve_modifier_mask ((int) c);
 157       /* If C still has any modifier bits, just ignore it.  */
 158       c &= ~CHAR_MODIFIER_MASK;
 159     }
 160
 161   MAYBE_UNIFY_CHAR (c);
 162
 163   if (c <= MAX_3_BYTE_CHAR)
 164     {
 165       bytes = CHAR_STRING (c, p);
 166     }
 167   else if (c <= MAX_4_BYTE_CHAR)
 168     {
 169       p[0] = (0xF0 | (c >> 18));
 170       p[1] = (0x80 | ((c >> 12) & 0x3F));
 171       p[2] = (0x80 | ((c >> 6) & 0x3F));
 172       p[3] = (0x80 | (c & 0x3F));
 173       bytes = 4;
 174     }
 175   else if (c <= MAX_5_BYTE_CHAR)
 176     {
 177       p[0] = 0xF8;
 178       p[1] = (0x80 | ((c >> 18) & 0x0F));
 179       p[2] = (0x80 | ((c >> 12) & 0x3F));
 180       p[3] = (0x80 | ((c >> 6) & 0x3F));
 181       p[4] = (0x80 | (c & 0x3F));
 182       bytes = 5;
 183     }
 184   else if (c <= MAX_CHAR)
 185     {
 186       c = CHAR_TO_BYTE8 (c);
 187       bytes = BYTE8_STRING (c, p);
 188     }
 189   else
 190     error ("Invalid character: %d", c);
 191
 192   return bytes;
 193 }
 194
 195
 196 /* Return a character whose multibyte form is at P.  Set LEN is not
 197    NULL, it must be a pointer to integer.  In that case, set *LEN to
 198    the byte length of the multibyte form.  If ADVANCED is not NULL, is
 199    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 200    the ending address (i.e. the starting address of the next
 201    character) of the multibyte form.  */
 202
 203 int
 204 string_char (p, advanced, len)
 205      const unsigned char *p;
 206      const unsigned char **advanced;
 207      int *len;
 208 {
 209   int c;
 210   const unsigned char *saved_p = p;
 211
 212   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 213     {
 214       c = STRING_CHAR_ADVANCE (p);
 215     }
 216   else if (! (*p & 0x08))
 217     {
 218       c = ((((p)[0] & 0xF) << 18)
 219            | (((p)[1] & 0x3F) << 12)
 220            | (((p)[2] & 0x3F) << 6)
 221            | ((p)[3] & 0x3F));
 222       p += 4;
 223     }
 224   else
 225     {
 226       c = ((((p)[1] & 0x3F) << 18)
 227            | (((p)[2] & 0x3F) << 12)
 228            | (((p)[3] & 0x3F) << 6)
 229            | ((p)[4] & 0x3F));
 230       p += 5;
 231     }
 232
 233   MAYBE_UNIFY_CHAR (c);
 234
 235   if (len)
 236     *len = p - saved_p;
 237   if (advanced)
 238     *advanced = p;
 239   return c;
 240 }
 241
 242
 243 /* Translate character C by translation table TABLE.  If C is
 244    negative, translate a character specified by CHARSET and CODE.  If
 245    no translation is found in TABLE, return the untranslated
 246    character.  If TABLE is a list, elements are char tables.  In this
 247    case, translace C by all tables.  */
 248
 249 int
 250 translate_char (table, c)
 251      Lisp_Object table;
 252      int c;
 253 {
 254   if (CHAR_TABLE_P (table))
 255     {
 256       Lisp_Object ch;
 257
 258       ch = CHAR_TABLE_REF (table, c);
 259       if (CHARACTERP (ch))
 260         c = XINT (ch);
 261     }
 262   else
 263     {
 264       for (; CONSP (table); table = XCDR (table))
 265         c = translate_char (XCAR (table), c);
 266     }
 267   return c;
 268 }
 269
 270 /* Convert the multibyte character C to unibyte 8-bit character based
 271    on the current value of charset_unibyte.  If dimension of
 272    charset_unibyte is more than one, return (C & 0xFF).
 273
 274    The argument REV_TBL is now ignored.  It will be removed in the
 275    future.  */
 276
 277 int
 278 multibyte_char_to_unibyte (c, rev_tbl)
 279      int c;
 280      Lisp_Object rev_tbl;
 281 {
 282   struct charset *charset;
 283   unsigned c1;
 284
 285   if (CHAR_BYTE8_P (c))
 286     return CHAR_TO_BYTE8 (c);
 287   charset = CHARSET_FROM_ID (charset_unibyte);
 288   c1 = ENCODE_CHAR (charset, c);
 289   return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
 290 }
 291
 292 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 293    by charset_unibyte.  */
 294
 295 int
 296 multibyte_char_to_unibyte_safe (c)
 297      int c;
 298 {
 299   struct charset *charset;
 300   unsigned c1;
 301
 302   if (CHAR_BYTE8_P (c))
 303     return CHAR_TO_BYTE8 (c);
 304   charset = CHARSET_FROM_ID (charset_unibyte);
 305   c1 = ENCODE_CHAR (charset, c);
 306   return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : -1);
 307 }
 308
 309 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 310        doc: /* Return non-nil if OBJECT is a character.  */)
 311      (object, ignore)
 312      Lisp_Object object, ignore;
 313 {
 314   return (CHARACTERP (object) ? Qt : Qnil);
 315 }
 316
 317 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 318        doc: /* Return the character of the maximum code.  */)
 319      ()
 320 {
 321   return make_number (MAX_CHAR);
 322 }
 323
 324 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 325        Sunibyte_char_to_multibyte, 1, 1, 0,
 326        doc: /* Convert the unibyte character CH to multibyte character.
 327 The multibyte character is a result of decoding CH by
 328 the current unibyte charset (see `unibyte-charset').  */)
 329      (ch)
 330      Lisp_Object ch;
 331 {
 332   int c;
 333   struct charset *charset;
 334
 335   CHECK_CHARACTER (ch);
 336   c = XFASTINT (ch);
 337   if (c >= 0400)
 338     error ("Invalid unibyte character: %d", c);
 339   charset = CHARSET_FROM_ID (charset_unibyte);
 340   c = DECODE_CHAR (charset, c);
 341   if (c < 0)
 342     c = BYTE8_TO_CHAR (XFASTINT (ch));
 343   return make_number (c);
 344 }
 345
 346 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 347        Smultibyte_char_to_unibyte, 1, 1, 0,
 348        doc: /* Convert the multibyte character CH to unibyte character.\n\
 349 The unibyte character is a result of encoding CH by
 350 the current primary charset (value of `charset-primary').  */)
 351      (ch)
 352      Lisp_Object ch;
 353 {
 354   int c;
 355
 356   CHECK_CHARACTER (ch);
 357   c = XFASTINT (ch);
 358   c = CHAR_TO_BYTE8 (c);
 359   return make_number (c);
 360 }
 361
 362 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
 363        doc: /* Return 1 regardless of the argument CHAR.
 364 This is now an obsolete function.  We keep it just for backward compatibility.   */)
 365      (ch)
 366      Lisp_Object ch;
 367 {
 368   CHECK_CHARACTER (ch);
 369   return make_number (1);
 370 }
 371
 372 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 373        doc: /* Return width of CHAR when displayed in the current buffer.
 374 The width is measured by how many columns it occupies on the screen.
 375 Tab is taken to occupy `tab-width' columns.  */)
 376      (ch)
 377        Lisp_Object ch;
 378 {
 379   Lisp_Object disp;
 380   int c, width;
 381   struct Lisp_Char_Table *dp = buffer_display_table ();
 382
 383   CHECK_CHARACTER (ch);
 384   c = XINT (ch);
 385
 386   /* Get the way the display table would display it.  */
 387   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
 388
 389   if (VECTORP (disp))
 390     width = ASIZE (disp);
 391   else
 392     width = CHAR_WIDTH (c);
 393
 394   return make_number (width);
 395 }
 396
 397 /* Return width of string STR of length LEN when displayed in the
 398    current buffer.  The width is measured by how many columns it
 399    occupies on the screen.  If PRECISION > 0, return the width of
 400    longest substring that doesn't exceed PRECISION, and set number of
 401    characters and bytes of the substring in *NCHARS and *NBYTES
 402    respectively.  */
 403
 404 int
 405 c_string_width (str, len, precision, nchars, nbytes)
 406      const unsigned char *str;
 407      int precision, *nchars, *nbytes;
 408 {
 409   int i = 0, i_byte = 0;
 410   int width = 0;
 411   struct Lisp_Char_Table *dp = buffer_display_table ();
 412
 413   while (i_byte < len)
 414     {
 415       int bytes, thiswidth;
 416       Lisp_Object val;
 417       int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 418
 419       if (dp)
 420         {
 421           val = DISP_CHAR_VECTOR (dp, c);
 422           if (VECTORP (val))
 423             thiswidth = XVECTOR (val)->size;
 424           else
 425             thiswidth = CHAR_WIDTH (c);
 426         }
 427       else
 428         {
 429           thiswidth = CHAR_WIDTH (c);
 430         }
 431
 432       if (precision > 0
 433           && (width + thiswidth > precision))
 434         {
 435           *nchars = i;
 436           *nbytes = i_byte;
 437           return width;
 438         }
 439       i++;
 440       i_byte += bytes;
 441       width += thiswidth;
 442   }
 443
 444   if (precision > 0)
 445     {
 446       *nchars = i;
 447       *nbytes = i_byte;
 448     }
 449
 450   return width;
 451 }
 452
 453 /* Return width of string STR of length LEN when displayed in the
 454    current buffer.  The width is measured by how many columns it
 455    occupies on the screen.  */
 456
 457 int
 458 strwidth (str, len)
 459      unsigned char *str;
 460      int len;
 461 {
 462   return c_string_width (str, len, -1, NULL, NULL);
 463 }
 464
 465 /* Return width of Lisp string STRING when displayed in the current
 466    buffer.  The width is measured by how many columns it occupies on
 467    the screen while paying attention to compositions.  If PRECISION >
 468    0, return the width of longest substring that doesn't exceed
 469    PRECISION, and set number of characters and bytes of the substring
 470    in *NCHARS and *NBYTES respectively.  */
 471
 472 int
 473 lisp_string_width (string, precision, nchars, nbytes)
 474      Lisp_Object string;
 475      int precision, *nchars, *nbytes;
 476 {
 477   int len = SCHARS (string);
 478   /* This set multibyte to 0 even if STRING is multibyte when it
 479      contains only ascii and eight-bit-graphic, but that's
 480      intentional.  */
 481   int multibyte = len < SBYTES (string);
 482   unsigned char *str = SDATA (string);
 483   int i = 0, i_byte = 0;
 484   int width = 0;
 485   struct Lisp_Char_Table *dp = buffer_display_table ();
 486
 487   while (i < len)
 488     {
 489       int chars, bytes, thiswidth;
 490       Lisp_Object val;
 491       int cmp_id;
 492       EMACS_INT ignore, end;
 493
 494       if (find_composition (i, -1, &ignore, &end, &val, string)
 495           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 496               >= 0))
 497         {
 498           thiswidth = composition_table[cmp_id]->width;
 499           chars = end - i;
 500           bytes = string_char_to_byte (string, end) - i_byte;
 501         }
 502       else
 503         {
 504           int c;
 505
 506           if (multibyte)
 507             c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 508           else
 509             c = str[i_byte], bytes = 1;
 510           chars = 1;
 511           if (dp)
 512             {
 513               val = DISP_CHAR_VECTOR (dp, c);
 514               if (VECTORP (val))
 515                 thiswidth = XVECTOR (val)->size;
 516               else
 517                 thiswidth = CHAR_WIDTH (c);
 518             }
 519           else
 520             {
 521               thiswidth = CHAR_WIDTH (c);
 522             }
 523         }
 524
 525       if (precision > 0
 526           && (width + thiswidth > precision))
 527         {
 528           *nchars = i;
 529           *nbytes = i_byte;
 530           return width;
 531         }
 532       i += chars;
 533       i_byte += bytes;
 534       width += thiswidth;
 535   }
 536
 537   if (precision > 0)
 538     {
 539       *nchars = i;
 540       *nbytes = i_byte;
 541     }
 542
 543   return width;
 544 }
 545
 546 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 547        doc: /* Return width of STRING when displayed in the current buffer.
 548 Width is measured by how many columns it occupies on the screen.
 549 When calculating width of a multibyte character in STRING,
 550 only the base leading-code is considered; the validity of
 551 the following bytes is not checked.  Tabs in STRING are always
 552 taken to occupy `tab-width' columns.  */)
 553      (str)
 554      Lisp_Object str;
 555 {
 556   Lisp_Object val;
 557
 558   CHECK_STRING (str);
 559   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 560   return val;
 561 }
 562
 563 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
 564        doc: /* Return the direction of CHAR.
 565 The returned value is 0 for left-to-right and 1 for right-to-left.  */)
 566      (ch)
 567      Lisp_Object ch;
 568 {
 569   int c;
 570
 571   CHECK_CHARACTER (ch);
 572   c = XINT (ch);
 573   return CHAR_TABLE_REF (Vchar_direction_table, c);
 574 }
 575
 576 /* Return the number of characters in the NBYTES bytes at PTR.
 577    This works by looking at the contents and checking for multibyte
 578    sequences while assuming that there's no invalid sequence.
 579    However, if the current buffer has enable-multibyte-characters =
 580    nil, we treat each byte as a character.  */
 581
 582 EMACS_INT
 583 chars_in_text (ptr, nbytes)
 584      const unsigned char *ptr;
 585      EMACS_INT nbytes;
 586 {
 587   /* current_buffer is null at early stages of Emacs initialization.  */
 588   if (current_buffer == 0
 589       || NILP (current_buffer->enable_multibyte_characters))
 590     return nbytes;
 591
 592   return multibyte_chars_in_text (ptr, nbytes);
 593 }
 594
 595 /* Return the number of characters in the NBYTES bytes at PTR.
 596    This works by looking at the contents and checking for multibyte
 597    sequences while assuming that there's no invalid sequence.  It
 598    ignores enable-multibyte-characters.  */
 599
 600 EMACS_INT
 601 multibyte_chars_in_text (ptr, nbytes)
 602      const unsigned char *ptr;
 603      EMACS_INT nbytes;
 604 {
 605   const unsigned char *endp = ptr + nbytes;
 606   int chars = 0;
 607
 608   while (ptr < endp)
 609     {
 610       int len = MULTIBYTE_LENGTH (ptr, endp);
 611
 612       if (len == 0)
 613         abort ();
 614       ptr += len;
 615       chars++;
 616     }
 617
 618   return chars;
 619 }
 620
 621 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 622    characters and bytes in it, and store them in *NCHARS and *NBYTES
 623    respectively.  On counting bytes, pay attention to that 8-bit
 624    characters not constructing a valid multibyte sequence are
 625    represented by 2-byte in a multibyte text.  */
 626
 627 void
 628 parse_str_as_multibyte (str, len, nchars, nbytes)
 629      const unsigned char *str;
 630      int len, *nchars, *nbytes;
 631 {
 632   const unsigned char *endp = str + len;
 633   int n, chars = 0, bytes = 0;
 634
 635   if (len >= MAX_MULTIBYTE_LENGTH)
 636     {
 637       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 638       while (str < adjusted_endp)
 639         {
 640           if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 641             str += n, bytes += n;
 642           else
 643             str++, bytes += 2;
 644           chars++;
 645         }
 646     }
 647   while (str < endp)
 648     {
 649       if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
 650         str += n, bytes += n;
 651       else
 652         str++, bytes += 2;
 653       chars++;
 654     }
 655
 656   *nchars = chars;
 657   *nbytes = bytes;
 658   return;
 659 }
 660
 661 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 662    It actually converts only such 8-bit characters that don't contruct
 663    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 664    NCHARS is nonzero, set *NCHARS to the number of characters in the
 665    text.  It is assured that we can use LEN bytes at STR as a work
 666    area and that is enough.  Return the number of bytes of the
 667    resulting text.  */
 668
 669 int
 670 str_as_multibyte (str, len, nbytes, nchars)
 671      unsigned char *str;
 672      int len, nbytes, *nchars;
 673 {
 674   unsigned char *p = str, *endp = str + nbytes;
 675   unsigned char *to;
 676   int chars = 0;
 677   int n;
 678
 679   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 680     {
 681       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 682       while (p < adjusted_endp
 683              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 684         p += n, chars++;
 685     }
 686   while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 687     p += n, chars++;
 688   if (nchars)
 689     *nchars = chars;
 690   if (p == endp)
 691     return nbytes;
 692
 693   to = p;
 694   nbytes = endp - p;
 695   endp = str + len;
 696   safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
 697   p = endp - nbytes;
 698
 699   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 700     {
 701       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 702       while (p < adjusted_endp)
 703         {
 704           if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 705             {
 706               while (n--)
 707                 *to++ = *p++;
 708             }
 709           else
 710             {
 711               int c = *p++;
 712               c = BYTE8_TO_CHAR (c);
 713               to += CHAR_STRING (c, to);
 714             }
 715         }
 716       chars++;
 717     }
 718   while (p < endp)
 719     {
 720       if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 721         {
 722           while (n--)
 723             *to++ = *p++;
 724         }
 725       else
 726         {
 727           int c = *p++;
 728           c = BYTE8_TO_CHAR (c);
 729           to += CHAR_STRING (c, to);
 730         }
 731       chars++;
 732     }
 733   if (nchars)
 734     *nchars = chars;
 735   return (to - str);
 736 }
 737
 738 /* Parse unibyte string at STR of LEN bytes, and return the number of
 739    bytes it may ocupy when converted to multibyte string by
 740    `str_to_multibyte'.  */
 741
 742 int
 743 parse_str_to_multibyte (str, len)
 744      unsigned char *str;
 745      int len;
 746 {
 747   unsigned char *endp = str + len;
 748   int bytes;
 749
 750   for (bytes = 0; str < endp; str++)
 751     bytes += (*str < 0x80) ? 1 : 2;
 752   return bytes;
 753 }
 754
 755
 756 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
 757    that contains the same single-byte characters.  It actually
 758    converts all 8-bit characters to multibyte forms.  It is assured
 759    that we can use LEN bytes at STR as a work area and that is
 760    enough.  */
 761
 762 int
 763 str_to_multibyte (str, len, bytes)
 764      unsigned char *str;
 765      int len, bytes;
 766 {
 767   unsigned char *p = str, *endp = str + bytes;
 768   unsigned char *to;
 769
 770   while (p < endp && *p < 0x80) p++;
 771   if (p == endp)
 772     return bytes;
 773   to = p;
 774   bytes = endp - p;
 775   endp = str + len;
 776   safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
 777   p = endp - bytes;
 778   while (p < endp)
 779     {
 780       int c = *p++;
 781
 782       if (c >= 0x80)
 783         c = BYTE8_TO_CHAR (c);
 784       to += CHAR_STRING (c, to);
 785     }
 786   return (to - str);
 787 }
 788
 789 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 790    actually converts characters in the range 0x80..0xFF to
 791    unibyte.  */
 792
 793 int
 794 str_as_unibyte (str, bytes)
 795      unsigned char *str;
 796      int bytes;
 797 {
 798   const unsigned char *p = str, *endp = str + bytes;
 799   unsigned char *to;
 800   int c, len;
 801
 802   while (p < endp)
 803     {
 804       c = *p;
 805       len = BYTES_BY_CHAR_HEAD (c);
 806       if (CHAR_BYTE8_HEAD_P (c))
 807         break;
 808       p += len;
 809     }
 810   to = str + (p - str);
 811   while (p < endp)
 812     {
 813       c = *p;
 814       len = BYTES_BY_CHAR_HEAD (c);
 815       if (CHAR_BYTE8_HEAD_P (c))
 816         {
 817           c = STRING_CHAR_ADVANCE (p);
 818           *to++ = CHAR_TO_BYTE8 (c);
 819         }
 820       else
 821         {
 822           while (len--) *to++ = *p++;
 823         }
 824     }
 825   return (to - str);
 826 }
 827
 828 int
 829 string_count_byte8 (string)
 830      Lisp_Object string;
 831 {
 832   int multibyte = STRING_MULTIBYTE (string);
 833   int nbytes = SBYTES (string);
 834   unsigned char *p = SDATA (string);
 835   unsigned char *pend = p + nbytes;
 836   int count = 0;
 837   int c, len;
 838
 839   if (multibyte)
 840     while (p < pend)
 841       {
 842         c = *p;
 843         len = BYTES_BY_CHAR_HEAD (c);
 844
 845         if (CHAR_BYTE8_HEAD_P (c))
 846           count++;
 847         p += len;
 848       }
 849   else
 850     while (p < pend)
 851       {
 852         if (*p++ >= 0x80)
 853           count++;
 854       }
 855   return count;
 856 }
 857
 858
 859 Lisp_Object
 860 string_escape_byte8 (string)
 861      Lisp_Object string;
 862 {
 863   int nchars = SCHARS (string);
 864   int nbytes = SBYTES (string);
 865   int multibyte = STRING_MULTIBYTE (string);
 866   int byte8_count;
 867   const unsigned char *src, *src_end;
 868   unsigned char *dst;
 869   Lisp_Object val;
 870   int c, len;
 871
 872   if (multibyte && nchars == nbytes)
 873     return string;
 874
 875   byte8_count = string_count_byte8 (string);
 876
 877   if (byte8_count == 0)
 878     return string;
 879
 880   if (multibyte)
 881     /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 882     val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 883                                         nbytes + byte8_count * 2);
 884   else
 885     /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 886     val = make_uninit_string (nbytes + byte8_count * 3);
 887
 888   src = SDATA (string);
 889   src_end = src + nbytes;
 890   dst = SDATA (val);
 891   if (multibyte)
 892     while (src < src_end)
 893       {
 894         c = *src;
 895         len = BYTES_BY_CHAR_HEAD (c);
 896
 897         if (CHAR_BYTE8_HEAD_P (c))
 898           {
 899             c = STRING_CHAR_ADVANCE (src);
 900             c = CHAR_TO_BYTE8 (c);
 901             sprintf ((char *) dst, "\\%03o", c);
 902             dst += 4;
 903           }
 904         else
 905           while (len--) *dst++ = *src++;
 906       }
 907   else
 908     while (src < src_end)
 909       {
 910         c = *src++;
 911         if (c >= 0x80)
 912           {
 913             sprintf ((char *) dst, "\\%03o", c);
 914             dst += 4;
 915           }
 916         else
 917           *dst++ = c;
 918       }
 919   return val;
 920 }
 921
 922 \f
 923 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 924        doc: /*
 925 Concatenate all the argument characters and make the result a string.
 926 usage: (string &rest CHARACTERS)  */)
 927      (n, args)
 928      int n;
 929      Lisp_Object *args;
 930 {
 931   int i;
 932   unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
 933   unsigned char *p = buf;
 934   int c;
 935
 936   for (i = 0; i < n; i++)
 937     {
 938       CHECK_CHARACTER (args[i]);
 939       c = XINT (args[i]);
 940       p += CHAR_STRING (c, p);
 941     }
 942
 943   return make_string_from_bytes ((char *) buf, n, p - buf);
 944 }
 945
 946 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 947        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 948 usage: (unibyte-string &rest BYTES)  */)
 949      (n, args)
 950      int n;
 951      Lisp_Object *args;
 952 {
 953   int i;
 954   unsigned char *buf = (unsigned char *) alloca (n);
 955   unsigned char *p = buf;
 956   unsigned c;
 957
 958   for (i = 0; i < n; i++)
 959     {
 960       CHECK_NATNUM (args[i]);
 961       c = XFASTINT (args[i]);
 962       if (c >= 256)
 963         args_out_of_range_3 (args[i], make_number (0), make_number (255));
 964       *p++ = c;
 965     }
 966
 967   return make_string_from_bytes ((char *) buf, n, p - buf);
 968 }
 969
 970 void
 971 init_character_once ()
 972 {
 973 }
 974
 975 #ifdef emacs
 976
 977 void
 978 syms_of_character ()
 979 {
 980   DEFSYM (Qcharacterp, "characterp");
 981   DEFSYM (Qauto_fill_chars, "auto-fill-chars");
 982
 983   staticpro (&Vchar_unify_table);
 984   Vchar_unify_table = Qnil;
 985
 986   defsubr (&Smax_char);
 987   defsubr (&Scharacterp);
 988   defsubr (&Sunibyte_char_to_multibyte);
 989   defsubr (&Smultibyte_char_to_unibyte);
 990   defsubr (&Schar_bytes);
 991   defsubr (&Schar_width);
 992   defsubr (&Sstring_width);
 993   defsubr (&Schar_direction);
 994   defsubr (&Sstring);
 995   defsubr (&Sunibyte_string);
 996
 997   DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
 998                doc: /*
 999 Vector recording all translation tables ever defined.
1000 Each element is a pair (SYMBOL . TABLE) relating the table to the
1001 symbol naming it.  The ID of a translation table is an index into this vector.  */);
1002   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1003
1004   DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1005                doc: /*
1006 A char-table for characters which invoke auto-filling.
1007 Such characters have value t in this table.  */);
1008   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1009   CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1010   CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
1011
1012   DEFVAR_LISP ("char-width-table", &Vchar_width_table,
1013                doc: /*
1014 A char-table for width (columns) of each character.  */);
1015   Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
1016   char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1017   char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1018                         make_number (4));
1019
1020   DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
1021                doc: /* A char-table for direction of each character.  */);
1022   Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
1023
1024   DEFVAR_LISP ("printable-chars", &Vprintable_chars,
1025                doc: /* A char-table for each printable character.  */);
1026   Vprintable_chars = Fmake_char_table (Qnil, Qnil);
1027   Fset_char_table_range (Vprintable_chars,
1028                          Fcons (make_number (32), make_number (126)), Qt);
1029   Fset_char_table_range (Vprintable_chars,
1030                          Fcons (make_number (160),
1031                                 make_number (MAX_5_BYTE_CHAR)), Qt);
1032
1033   DEFVAR_LISP ("char-script-table", &Vchar_script_table,
1034                doc: /* Char table of script symbols.
1035 It has one extra slot whose value is a list of script symbols.  */);
1036
1037   /* Intern this now in case it isn't already done.
1038      Setting this variable twice is harmless.
1039      But don't staticpro it here--that is done in alloc.c.  */
1040   Qchar_table_extra_slots = intern ("char-table-extra-slots");
1041   DEFSYM (Qchar_script_table, "char-script-table");
1042   Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1043   Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
1044
1045   DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
1046                doc: /* Alist of scripts vs the representative characters.  */);
1047   Vscript_representative_chars = Qnil;
1048 }
1049
1050 #endif /* emacs */
1051
1052 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1053    (do not change this comment) */