src/character.c

   1 /* Basic character support.
   2    Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
   5      Free Software Foundation, Inc.
   6    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008
   7      National Institute of Advanced Industrial Science and Technology (AIST)
   8      Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
  25 /* At first, see the document in `character.h' to understand the code
  26    in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #include <stdio.h>
  33
  34 #ifdef emacs
  35
  36 #include <sys/types.h>
  37 #include "lisp.h"
  38 #include "character.h"
  39 #include "buffer.h"
  40 #include "charset.h"
  41 #include "composite.h"
  42 #include "disptab.h"
  43
  44 #else  /* not emacs */
  45
  46 #include "mulelib.h"
  47
  48 #endif /* emacs */
  49
  50 Lisp_Object Qcharacterp;
  51
  52 /* Vector of all translation tables ever defined.
  53    The ID of a translation table is used to index this vector.  */
  54 Lisp_Object Vtranslation_table_vector;
  55
  56 /* A char-table for characters which may invoke auto-filling.  */
  57 Lisp_Object Vauto_fill_chars;
  58
  59 Lisp_Object Qauto_fill_chars;
  60
  61 /* Char-table of information about which character to unify to which
  62    Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
  63 Lisp_Object Vchar_unify_table;
  64
  65 /* A char-table.  An element is non-nil iff the corresponding
  66    character has a printable glyph.  */
  67 Lisp_Object Vprintable_chars;
  68
  69 /* A char-table.  An element is the column width of the corresponding
  70    character.  */
  71 Lisp_Object Vchar_width_table;
  72
  73 /* A char-table.  An element is a symbol indicating the direction
  74    property of corresponding character.  */
  75 Lisp_Object Vchar_direction_table;
  76
  77 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
  78 unsigned char *_fetch_multibyte_char_p;
  79
  80 /* Char table of scripts.  */
  81 Lisp_Object Vchar_script_table;
  82
  83 /* Alist of scripts vs representative characters.  */
  84 Lisp_Object Vscript_representative_chars;
  85
  86 static Lisp_Object Qchar_script_table;
  87
  88 Lisp_Object Vunicode_category_table;
  89
  90 /* Mapping table from unibyte chars to multibyte chars.  */
  91 int unibyte_to_multibyte_table[256];
  92
  93 /* Nth element is 1 iff unibyte char N can be mapped to a multibyte
  94    char.  */
  95 char unibyte_has_multibyte_table[256];
  96
  97 \f
  98
  99 /* If character code C has modifier masks, reflect them to the
 100    character code if possible.  Return the resulting code.  */
 101
 102 int
 103 char_resolve_modifier_mask (c)
 104      int c;
 105 {
 106   /* A non-ASCII character can't reflect modifier bits to the code.  */
 107   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 108     return c;
 109
 110   /* For Meta, Shift, and Control modifiers, we need special care.  */
 111   if (c & CHAR_SHIFT)
 112     {
 113       /* Shift modifier is valid only with [A-Za-z].  */
 114       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 115         c &= ~CHAR_SHIFT;
 116       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 117         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 118       /* Shift modifier for control characters and SPC is ignored.  */
 119       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
 120         c &= ~CHAR_SHIFT;
 121     }
 122   if (c & CHAR_CTL)
 123     {
 124       /* Simulate the code in lread.c.  */
 125       /* Allow `\C- ' and `\C-?'.  */
 126       if ((c & 0377) == ' ')
 127         c &= ~0177 & ~ CHAR_CTL;
 128       else if ((c & 0377) == '?')
 129         c = 0177 | (c & ~0177 & ~CHAR_CTL);
 130       /* ASCII control chars are made from letters (both cases),
 131          as well as the non-letters within 0100...0137.  */
 132       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 133         c &= (037 | (~0177 & ~CHAR_CTL));
 134       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 135         c &= (037 | (~0177 & ~CHAR_CTL));
 136     }
 137   if (c & CHAR_META)
 138     {
 139       /* Move the meta bit to the right place for a string.  */
 140       c = (c & ~CHAR_META) | 0x80;
 141     }
 142
 143   return c;
 144 }
 145
 146
 147 /* Store multibyte form of character C at P.  If C has modifier bits,
 148    handle them appropriately.  */
 149
 150 int
 151 char_string (c, p)
 152      unsigned c;
 153      unsigned char *p;
 154 {
 155   int bytes;
 156
 157   if (c & CHAR_MODIFIER_MASK)
 158     {
 159       c = (unsigned) char_resolve_modifier_mask ((int) c);
 160       /* If C still has any modifier bits, just ignore it.  */
 161       c &= ~CHAR_MODIFIER_MASK;
 162     }
 163
 164   MAYBE_UNIFY_CHAR (c);
 165
 166   if (c <= MAX_3_BYTE_CHAR)
 167     {
 168       bytes = CHAR_STRING (c, p);
 169     }
 170   else if (c <= MAX_4_BYTE_CHAR)
 171     {
 172       p[0] = (0xF0 | (c >> 18));
 173       p[1] = (0x80 | ((c >> 12) & 0x3F));
 174       p[2] = (0x80 | ((c >> 6) & 0x3F));
 175       p[3] = (0x80 | (c & 0x3F));
 176       bytes = 4;
 177     }
 178   else if (c <= MAX_5_BYTE_CHAR)
 179     {
 180       p[0] = 0xF8;
 181       p[1] = (0x80 | ((c >> 18) & 0x0F));
 182       p[2] = (0x80 | ((c >> 12) & 0x3F));
 183       p[3] = (0x80 | ((c >> 6) & 0x3F));
 184       p[4] = (0x80 | (c & 0x3F));
 185       bytes = 5;
 186     }
 187   else if (c <= MAX_CHAR)
 188     {
 189       c = CHAR_TO_BYTE8 (c);
 190       bytes = BYTE8_STRING (c, p);
 191     }
 192   else
 193     error ("Invalid character: %d", c);
 194
 195   return bytes;
 196 }
 197
 198
 199 /* Return a character whose multibyte form is at P.  Set LEN is not
 200    NULL, it must be a pointer to integer.  In that case, set *LEN to
 201    the byte length of the multibyte form.  If ADVANCED is not NULL, is
 202    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 203    the ending address (i.e. the starting address of the next
 204    character) of the multibyte form.  */
 205
 206 int
 207 string_char (p, advanced, len)
 208      const unsigned char *p;
 209      const unsigned char **advanced;
 210      int *len;
 211 {
 212   int c;
 213   const unsigned char *saved_p = p;
 214
 215   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 216     {
 217       c = STRING_CHAR_ADVANCE (p);
 218     }
 219   else if (! (*p & 0x08))
 220     {
 221       c = ((((p)[0] & 0xF) << 18)
 222            | (((p)[1] & 0x3F) << 12)
 223            | (((p)[2] & 0x3F) << 6)
 224            | ((p)[3] & 0x3F));
 225       p += 4;
 226     }
 227   else
 228     {
 229       c = ((((p)[1] & 0x3F) << 18)
 230            | (((p)[2] & 0x3F) << 12)
 231            | (((p)[3] & 0x3F) << 6)
 232            | ((p)[4] & 0x3F));
 233       p += 5;
 234     }
 235
 236   MAYBE_UNIFY_CHAR (c);
 237
 238   if (len)
 239     *len = p - saved_p;
 240   if (advanced)
 241     *advanced = p;
 242   return c;
 243 }
 244
 245
 246 /* Translate character C by translation table TABLE.  If C is
 247    negative, translate a character specified by CHARSET and CODE.  If
 248    no translation is found in TABLE, return the untranslated
 249    character.  If TABLE is a list, elements are char tables.  In this
 250    case, translace C by all tables.  */
 251
 252 int
 253 translate_char (table, c)
 254      Lisp_Object table;
 255      int c;
 256 {
 257   if (CHAR_TABLE_P (table))
 258     {
 259       Lisp_Object ch;
 260
 261       ch = CHAR_TABLE_REF (table, c);
 262       if (CHARACTERP (ch))
 263         c = XINT (ch);
 264     }
 265   else
 266     {
 267       for (; CONSP (table); table = XCDR (table))
 268         c = translate_char (XCAR (table), c);
 269     }
 270   return c;
 271 }
 272
 273 /* Convert the multibyte character C to unibyte 8-bit character based
 274    on the current value of charset_unibyte.  If dimension of
 275    charset_unibyte is more than one, return (C & 0xFF).
 276
 277    The argument REV_TBL is now ignored.  It will be removed in the
 278    future.  */
 279
 280 int
 281 multibyte_char_to_unibyte (c, rev_tbl)
 282      int c;
 283      Lisp_Object rev_tbl;
 284 {
 285   struct charset *charset;
 286   unsigned c1;
 287
 288   if (CHAR_BYTE8_P (c))
 289     return CHAR_TO_BYTE8 (c);
 290   charset = CHARSET_FROM_ID (charset_unibyte);
 291   c1 = ENCODE_CHAR (charset, c);
 292   return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
 293 }
 294
 295 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 296    by charset_unibyte.  */
 297
 298 int
 299 multibyte_char_to_unibyte_safe (c)
 300      int c;
 301 {
 302   struct charset *charset;
 303   unsigned c1;
 304
 305   if (CHAR_BYTE8_P (c))
 306     return CHAR_TO_BYTE8 (c);
 307   charset = CHARSET_FROM_ID (charset_unibyte);
 308   c1 = ENCODE_CHAR (charset, c);
 309   return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : -1);
 310 }
 311
 312 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 313        doc: /* Return non-nil if OBJECT is a character.  */)
 314      (object, ignore)
 315      Lisp_Object object, ignore;
 316 {
 317   return (CHARACTERP (object) ? Qt : Qnil);
 318 }
 319
 320 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 321        doc: /* Return the character of the maximum code.  */)
 322      ()
 323 {
 324   return make_number (MAX_CHAR);
 325 }
 326
 327 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 328        Sunibyte_char_to_multibyte, 1, 1, 0,
 329        doc: /* Convert the byte CH to multibyte character.  */)
 330      (ch)
 331      Lisp_Object ch;
 332 {
 333   int c;
 334   struct charset *charset;
 335
 336   CHECK_CHARACTER (ch);
 337   c = XFASTINT (ch);
 338   if (c >= 0400)
 339     error ("Invalid unibyte character: %d", c);
 340   charset = CHARSET_FROM_ID (charset_unibyte);
 341   c = DECODE_CHAR (charset, c);
 342   if (c < 0)
 343     c = BYTE8_TO_CHAR (XFASTINT (ch));
 344   return make_number (c);
 345 }
 346
 347 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 348        Smultibyte_char_to_unibyte, 1, 1, 0,
 349        doc: /* Convert the multibyte character CH to a byte.
 350 If the multibyte character does not represent a byte, return -1.  */)
 351      (ch)
 352      Lisp_Object ch;
 353 {
 354   int cm;
 355
 356   CHECK_CHARACTER (ch);
 357   cm = XFASTINT (ch);
 358   if (cm < 256)
 359     /* Can't distinguish a byte read from a unibyte buffer from
 360        a latin1 char, so let's let it slide.  */
 361     return ch;
 362   else
 363     {
 364       int cu = CHAR_TO_BYTE_SAFE (cm);
 365       return make_number (cu);
 366     }
 367 }
 368
 369 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
 370        doc: /* Return 1 regardless of the argument CHAR.
 371 This is now an obsolete function.  We keep it just for backward compatibility.
 372 usage: (char-bytes CHAR)  */)
 373      (ch)
 374      Lisp_Object ch;
 375 {
 376   CHECK_CHARACTER (ch);
 377   return make_number (1);
 378 }
 379
 380 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 381        doc: /* Return width of CHAR when displayed in the current buffer.
 382 The width is measured by how many columns it occupies on the screen.
 383 Tab is taken to occupy `tab-width' columns.
 384 usage: (char-width CHAR)  */)
 385      (ch)
 386        Lisp_Object ch;
 387 {
 388   Lisp_Object disp;
 389   int c, width;
 390   struct Lisp_Char_Table *dp = buffer_display_table ();
 391
 392   CHECK_CHARACTER (ch);
 393   c = XINT (ch);
 394
 395   /* Get the way the display table would display it.  */
 396   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
 397
 398   if (VECTORP (disp))
 399     width = ASIZE (disp);
 400   else
 401     width = CHAR_WIDTH (c);
 402
 403   return make_number (width);
 404 }
 405
 406 /* Return width of string STR of length LEN when displayed in the
 407    current buffer.  The width is measured by how many columns it
 408    occupies on the screen.  If PRECISION > 0, return the width of
 409    longest substring that doesn't exceed PRECISION, and set number of
 410    characters and bytes of the substring in *NCHARS and *NBYTES
 411    respectively.  */
 412
 413 int
 414 c_string_width (str, len, precision, nchars, nbytes)
 415      const unsigned char *str;
 416      int precision, *nchars, *nbytes;
 417 {
 418   int i = 0, i_byte = 0;
 419   int width = 0;
 420   struct Lisp_Char_Table *dp = buffer_display_table ();
 421
 422   while (i_byte < len)
 423     {
 424       int bytes, thiswidth;
 425       Lisp_Object val;
 426       int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 427
 428       if (dp)
 429         {
 430           val = DISP_CHAR_VECTOR (dp, c);
 431           if (VECTORP (val))
 432             thiswidth = XVECTOR (val)->size;
 433           else
 434             thiswidth = CHAR_WIDTH (c);
 435         }
 436       else
 437         {
 438           thiswidth = CHAR_WIDTH (c);
 439         }
 440
 441       if (precision > 0
 442           && (width + thiswidth > precision))
 443         {
 444           *nchars = i;
 445           *nbytes = i_byte;
 446           return width;
 447         }
 448       i++;
 449       i_byte += bytes;
 450       width += thiswidth;
 451   }
 452
 453   if (precision > 0)
 454     {
 455       *nchars = i;
 456       *nbytes = i_byte;
 457     }
 458
 459   return width;
 460 }
 461
 462 /* Return width of string STR of length LEN when displayed in the
 463    current buffer.  The width is measured by how many columns it
 464    occupies on the screen.  */
 465
 466 int
 467 strwidth (str, len)
 468      unsigned char *str;
 469      int len;
 470 {
 471   return c_string_width (str, len, -1, NULL, NULL);
 472 }
 473
 474 /* Return width of Lisp string STRING when displayed in the current
 475    buffer.  The width is measured by how many columns it occupies on
 476    the screen while paying attention to compositions.  If PRECISION >
 477    0, return the width of longest substring that doesn't exceed
 478    PRECISION, and set number of characters and bytes of the substring
 479    in *NCHARS and *NBYTES respectively.  */
 480
 481 int
 482 lisp_string_width (string, precision, nchars, nbytes)
 483      Lisp_Object string;
 484      int precision, *nchars, *nbytes;
 485 {
 486   int len = SCHARS (string);
 487   /* This set multibyte to 0 even if STRING is multibyte when it
 488      contains only ascii and eight-bit-graphic, but that's
 489      intentional.  */
 490   int multibyte = len < SBYTES (string);
 491   unsigned char *str = SDATA (string);
 492   int i = 0, i_byte = 0;
 493   int width = 0;
 494   struct Lisp_Char_Table *dp = buffer_display_table ();
 495
 496   while (i < len)
 497     {
 498       int chars, bytes, thiswidth;
 499       Lisp_Object val;
 500       int cmp_id;
 501       EMACS_INT ignore, end;
 502
 503       if (find_composition (i, -1, &ignore, &end, &val, string)
 504           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 505               >= 0))
 506         {
 507           thiswidth = composition_table[cmp_id]->width;
 508           chars = end - i;
 509           bytes = string_char_to_byte (string, end) - i_byte;
 510         }
 511       else
 512         {
 513           int c;
 514
 515           if (multibyte)
 516             c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 517           else
 518             c = str[i_byte], bytes = 1;
 519           chars = 1;
 520           if (dp)
 521             {
 522               val = DISP_CHAR_VECTOR (dp, c);
 523               if (VECTORP (val))
 524                 thiswidth = XVECTOR (val)->size;
 525               else
 526                 thiswidth = CHAR_WIDTH (c);
 527             }
 528           else
 529             {
 530               thiswidth = CHAR_WIDTH (c);
 531             }
 532         }
 533
 534       if (precision > 0
 535           && (width + thiswidth > precision))
 536         {
 537           *nchars = i;
 538           *nbytes = i_byte;
 539           return width;
 540         }
 541       i += chars;
 542       i_byte += bytes;
 543       width += thiswidth;
 544   }
 545
 546   if (precision > 0)
 547     {
 548       *nchars = i;
 549       *nbytes = i_byte;
 550     }
 551
 552   return width;
 553 }
 554
 555 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 556        doc: /* Return width of STRING when displayed in the current buffer.
 557 Width is measured by how many columns it occupies on the screen.
 558 When calculating width of a multibyte character in STRING,
 559 only the base leading-code is considered; the validity of
 560 the following bytes is not checked.  Tabs in STRING are always
 561 taken to occupy `tab-width' columns.
 562 usage: (string-width STRING)  */)
 563      (str)
 564      Lisp_Object str;
 565 {
 566   Lisp_Object val;
 567
 568   CHECK_STRING (str);
 569   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 570   return val;
 571 }
 572
 573 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
 574        doc: /* Return the direction of CHAR.
 575 The returned value is 0 for left-to-right and 1 for right-to-left.
 576 usage: (char-direction CHAR)  */)
 577      (ch)
 578      Lisp_Object ch;
 579 {
 580   int c;
 581
 582   CHECK_CHARACTER (ch);
 583   c = XINT (ch);
 584   return CHAR_TABLE_REF (Vchar_direction_table, c);
 585 }
 586
 587 /* Return the number of characters in the NBYTES bytes at PTR.
 588    This works by looking at the contents and checking for multibyte
 589    sequences while assuming that there's no invalid sequence.
 590    However, if the current buffer has enable-multibyte-characters =
 591    nil, we treat each byte as a character.  */
 592
 593 EMACS_INT
 594 chars_in_text (ptr, nbytes)
 595      const unsigned char *ptr;
 596      EMACS_INT nbytes;
 597 {
 598   /* current_buffer is null at early stages of Emacs initialization.  */
 599   if (current_buffer == 0
 600       || NILP (current_buffer->enable_multibyte_characters))
 601     return nbytes;
 602
 603   return multibyte_chars_in_text (ptr, nbytes);
 604 }
 605
 606 /* Return the number of characters in the NBYTES bytes at PTR.
 607    This works by looking at the contents and checking for multibyte
 608    sequences while assuming that there's no invalid sequence.  It
 609    ignores enable-multibyte-characters.  */
 610
 611 EMACS_INT
 612 multibyte_chars_in_text (ptr, nbytes)
 613      const unsigned char *ptr;
 614      EMACS_INT nbytes;
 615 {
 616   const unsigned char *endp = ptr + nbytes;
 617   int chars = 0;
 618
 619   while (ptr < endp)
 620     {
 621       int len = MULTIBYTE_LENGTH (ptr, endp);
 622
 623       if (len == 0)
 624         abort ();
 625       ptr += len;
 626       chars++;
 627     }
 628
 629   return chars;
 630 }
 631
 632 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 633    characters and bytes in it, and store them in *NCHARS and *NBYTES
 634    respectively.  On counting bytes, pay attention to that 8-bit
 635    characters not constructing a valid multibyte sequence are
 636    represented by 2-byte in a multibyte text.  */
 637
 638 void
 639 parse_str_as_multibyte (str, len, nchars, nbytes)
 640      const unsigned char *str;
 641      int len, *nchars, *nbytes;
 642 {
 643   const unsigned char *endp = str + len;
 644   int n, chars = 0, bytes = 0;
 645
 646   if (len >= MAX_MULTIBYTE_LENGTH)
 647     {
 648       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 649       while (str < adjusted_endp)
 650         {
 651           if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 652             str += n, bytes += n;
 653           else
 654             str++, bytes += 2;
 655           chars++;
 656         }
 657     }
 658   while (str < endp)
 659     {
 660       if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
 661         str += n, bytes += n;
 662       else
 663         str++, bytes += 2;
 664       chars++;
 665     }
 666
 667   *nchars = chars;
 668   *nbytes = bytes;
 669   return;
 670 }
 671
 672 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 673    It actually converts only such 8-bit characters that don't contruct
 674    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 675    NCHARS is nonzero, set *NCHARS to the number of characters in the
 676    text.  It is assured that we can use LEN bytes at STR as a work
 677    area and that is enough.  Return the number of bytes of the
 678    resulting text.  */
 679
 680 int
 681 str_as_multibyte (str, len, nbytes, nchars)
 682      unsigned char *str;
 683      int len, nbytes, *nchars;
 684 {
 685   unsigned char *p = str, *endp = str + nbytes;
 686   unsigned char *to;
 687   int chars = 0;
 688   int n;
 689
 690   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 691     {
 692       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 693       while (p < adjusted_endp
 694              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 695         p += n, chars++;
 696     }
 697   while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 698     p += n, chars++;
 699   if (nchars)
 700     *nchars = chars;
 701   if (p == endp)
 702     return nbytes;
 703
 704   to = p;
 705   nbytes = endp - p;
 706   endp = str + len;
 707   safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
 708   p = endp - nbytes;
 709
 710   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 711     {
 712       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 713       while (p < adjusted_endp)
 714         {
 715           if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 716             {
 717               while (n--)
 718                 *to++ = *p++;
 719             }
 720           else
 721             {
 722               int c = *p++;
 723               c = BYTE8_TO_CHAR (c);
 724               to += CHAR_STRING (c, to);
 725             }
 726         }
 727       chars++;
 728     }
 729   while (p < endp)
 730     {
 731       if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 732         {
 733           while (n--)
 734             *to++ = *p++;
 735         }
 736       else
 737         {
 738           int c = *p++;
 739           c = BYTE8_TO_CHAR (c);
 740           to += CHAR_STRING (c, to);
 741         }
 742       chars++;
 743     }
 744   if (nchars)
 745     *nchars = chars;
 746   return (to - str);
 747 }
 748
 749 /* Parse unibyte string at STR of LEN bytes, and return the number of
 750    bytes it may ocupy when converted to multibyte string by
 751    `str_to_multibyte'.  */
 752
 753 int
 754 parse_str_to_multibyte (str, len)
 755      unsigned char *str;
 756      int len;
 757 {
 758   unsigned char *endp = str + len;
 759   int bytes;
 760
 761   for (bytes = 0; str < endp; str++)
 762     bytes += (*str < 0x80) ? 1 : 2;
 763   return bytes;
 764 }
 765
 766
 767 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
 768    that contains the same single-byte characters.  It actually
 769    converts all 8-bit characters to multibyte forms.  It is assured
 770    that we can use LEN bytes at STR as a work area and that is
 771    enough.  */
 772
 773 int
 774 str_to_multibyte (str, len, bytes)
 775      unsigned char *str;
 776      int len, bytes;
 777 {
 778   unsigned char *p = str, *endp = str + bytes;
 779   unsigned char *to;
 780
 781   while (p < endp && *p < 0x80) p++;
 782   if (p == endp)
 783     return bytes;
 784   to = p;
 785   bytes = endp - p;
 786   endp = str + len;
 787   safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
 788   p = endp - bytes;
 789   while (p < endp)
 790     {
 791       int c = *p++;
 792
 793       if (c >= 0x80)
 794         c = BYTE8_TO_CHAR (c);
 795       to += CHAR_STRING (c, to);
 796     }
 797   return (to - str);
 798 }
 799
 800 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 801    actually converts characters in the range 0x80..0xFF to
 802    unibyte.  */
 803
 804 int
 805 str_as_unibyte (str, bytes)
 806      unsigned char *str;
 807      int bytes;
 808 {
 809   const unsigned char *p = str, *endp = str + bytes;
 810   unsigned char *to;
 811   int c, len;
 812
 813   while (p < endp)
 814     {
 815       c = *p;
 816       len = BYTES_BY_CHAR_HEAD (c);
 817       if (CHAR_BYTE8_HEAD_P (c))
 818         break;
 819       p += len;
 820     }
 821   to = str + (p - str);
 822   while (p < endp)
 823     {
 824       c = *p;
 825       len = BYTES_BY_CHAR_HEAD (c);
 826       if (CHAR_BYTE8_HEAD_P (c))
 827         {
 828           c = STRING_CHAR_ADVANCE (p);
 829           *to++ = CHAR_TO_BYTE8 (c);
 830         }
 831       else
 832         {
 833           while (len--) *to++ = *p++;
 834         }
 835     }
 836   return (to - str);
 837 }
 838
 839 /* Convert eight-bit chars in SRC (in multibyte form) to the
 840    corresponding byte and store in DST.  CHARS is the number of
 841    characters in SRC.  The value is the number of bytes stored in DST.
 842    Usually, the value is the same as CHARS, but is less than it if SRC
 843    contains a non-ASCII, non-eight-bit characater.  If ACCEPT_LATIN_1
 844    is nonzero, a Latin-1 character is accepted and converted to a byte
 845    of that character code.
 846    Note: Currently the arg ACCEPT_LATIN_1 is not used.  */
 847
 848 EMACS_INT
 849 str_to_unibyte (src, dst, chars, accept_latin_1)
 850      const unsigned char *src;
 851      unsigned char *dst;
 852      EMACS_INT chars;
 853      int accept_latin_1;
 854 {
 855   EMACS_INT i;
 856
 857   for (i = 0; i < chars; i++)
 858     {
 859       int c = STRING_CHAR_ADVANCE (src);
 860
 861       if (CHAR_BYTE8_P (c))
 862         c = CHAR_TO_BYTE8 (c);
 863       else if (! ASCII_CHAR_P (c)
 864                && (! accept_latin_1 || c >= 0x100))
 865         return i;
 866       *dst++ = c;
 867     }
 868   return i;
 869 }
 870
 871
 872 int
 873 string_count_byte8 (string)
 874      Lisp_Object string;
 875 {
 876   int multibyte = STRING_MULTIBYTE (string);
 877   int nbytes = SBYTES (string);
 878   unsigned char *p = SDATA (string);
 879   unsigned char *pend = p + nbytes;
 880   int count = 0;
 881   int c, len;
 882
 883   if (multibyte)
 884     while (p < pend)
 885       {
 886         c = *p;
 887         len = BYTES_BY_CHAR_HEAD (c);
 888
 889         if (CHAR_BYTE8_HEAD_P (c))
 890           count++;
 891         p += len;
 892       }
 893   else
 894     while (p < pend)
 895       {
 896         if (*p++ >= 0x80)
 897           count++;
 898       }
 899   return count;
 900 }
 901
 902
 903 Lisp_Object
 904 string_escape_byte8 (string)
 905      Lisp_Object string;
 906 {
 907   int nchars = SCHARS (string);
 908   int nbytes = SBYTES (string);
 909   int multibyte = STRING_MULTIBYTE (string);
 910   int byte8_count;
 911   const unsigned char *src, *src_end;
 912   unsigned char *dst;
 913   Lisp_Object val;
 914   int c, len;
 915
 916   if (multibyte && nchars == nbytes)
 917     return string;
 918
 919   byte8_count = string_count_byte8 (string);
 920
 921   if (byte8_count == 0)
 922     return string;
 923
 924   if (multibyte)
 925     /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 926     val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 927                                         nbytes + byte8_count * 2);
 928   else
 929     /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 930     val = make_uninit_string (nbytes + byte8_count * 3);
 931
 932   src = SDATA (string);
 933   src_end = src + nbytes;
 934   dst = SDATA (val);
 935   if (multibyte)
 936     while (src < src_end)
 937       {
 938         c = *src;
 939         len = BYTES_BY_CHAR_HEAD (c);
 940
 941         if (CHAR_BYTE8_HEAD_P (c))
 942           {
 943             c = STRING_CHAR_ADVANCE (src);
 944             c = CHAR_TO_BYTE8 (c);
 945             sprintf ((char *) dst, "\\%03o", c);
 946             dst += 4;
 947           }
 948         else
 949           while (len--) *dst++ = *src++;
 950       }
 951   else
 952     while (src < src_end)
 953       {
 954         c = *src++;
 955         if (c >= 0x80)
 956           {
 957             sprintf ((char *) dst, "\\%03o", c);
 958             dst += 4;
 959           }
 960         else
 961           *dst++ = c;
 962       }
 963   return val;
 964 }
 965
 966 \f
 967 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 968        doc: /*
 969 Concatenate all the argument characters and make the result a string.
 970 usage: (string &rest CHARACTERS)  */)
 971      (n, args)
 972      int n;
 973      Lisp_Object *args;
 974 {
 975   int i;
 976   unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
 977   unsigned char *p = buf;
 978   int c;
 979
 980   for (i = 0; i < n; i++)
 981     {
 982       CHECK_CHARACTER (args[i]);
 983       c = XINT (args[i]);
 984       p += CHAR_STRING (c, p);
 985     }
 986
 987   return make_string_from_bytes ((char *) buf, n, p - buf);
 988 }
 989
 990 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 991        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 992 usage: (unibyte-string &rest BYTES)  */)
 993      (n, args)
 994      int n;
 995      Lisp_Object *args;
 996 {
 997   int i;
 998   unsigned char *buf = (unsigned char *) alloca (n);
 999   unsigned char *p = buf;
1000   unsigned c;
1001
1002   for (i = 0; i < n; i++)
1003     {
1004       CHECK_NATNUM (args[i]);
1005       c = XFASTINT (args[i]);
1006       if (c >= 256)
1007         args_out_of_range_3 (args[i], make_number (0), make_number (255));
1008       *p++ = c;
1009     }
1010
1011   return make_string_from_bytes ((char *) buf, n, p - buf);
1012 }
1013
1014 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
1015        Schar_resolve_modifiers, 1, 1, 0,
1016        doc: /* Resolve modifiers in the character CHAR.
1017 The value is a character with modifiers resolved into the character
1018 code.  Unresolved modifiers are kept in the value.
1019 usage: (char-resolve-modifiers CHAR)  */)
1020      (character)
1021      Lisp_Object character;
1022 {
1023   int c;
1024
1025   CHECK_NUMBER (character);
1026   c = XINT (character);
1027   return make_number (char_resolve_modifier_mask (c));
1028 }
1029
1030 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
1031        doc: /* Return a byte value of a character at point.
1032 Optional 1st arg POSITION, if non-nil, is a position of a character to get
1033 a byte value.
1034 Optional 2nd arg STRING, if non-nil, is a string of which first
1035 character is a target to get a byte value.  In this case, POSITION, if
1036 non-nil, is an index of a target character in the string.
1037
1038 If the current buffer (or STRING) is multibyte, and the target
1039 character is not ASCII nor 8-bit character, an error is signalled.  */)
1040      (position, string)
1041      Lisp_Object position, string;
1042 {
1043   int c;
1044   EMACS_INT pos;
1045   unsigned char *p;
1046
1047   if (NILP (string))
1048     {
1049       if (NILP (position))
1050         {
1051           p = PT_ADDR;
1052         }
1053       else
1054         {
1055           CHECK_NUMBER_COERCE_MARKER (position);
1056           if (XINT (position) < BEGV || XINT (position) >= ZV)
1057             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
1058           pos = XFASTINT (position);
1059           p = CHAR_POS_ADDR (pos);
1060         }
1061       if (NILP (current_buffer->enable_multibyte_characters))
1062         return make_number (*p);
1063     }
1064   else
1065     {
1066       CHECK_STRING (string);
1067       if (NILP (position))
1068         {
1069           p = SDATA (string);
1070         }
1071       else
1072         {
1073           CHECK_NATNUM (position);
1074           if (XINT (position) >= SCHARS (string))
1075             args_out_of_range (string, position);
1076           pos = XFASTINT (position);
1077           p = SDATA (string) + string_char_to_byte (string, pos);
1078         }
1079       if (! STRING_MULTIBYTE (string))
1080         return make_number (*p);
1081     }
1082   c = STRING_CHAR (p, 0);
1083   if (CHAR_BYTE8_P (c))
1084     c = CHAR_TO_BYTE8 (c);
1085   else if (! ASCII_CHAR_P (c))
1086     error ("Not an ASCII nor an 8-bit character: %d", c);
1087   return make_number (c);
1088 }
1089
1090
/* One-time initialization hook for this file.  The body is currently
   empty, so calling it has no effect.  */

void
init_character_once ()
{
}
1095
1096 #ifdef emacs
1097
/* Set up the Lisp-visible pieces of this file: the symbols it uses,
   the primitives defined with DEFUN above, and the Lisp variables
   (mostly char-tables) together with their initial values.  */

void
syms_of_character ()
{
  /* Symbols referred to from C through Qcharacterp and
     Qauto_fill_chars.  */
  DEFSYM (Qcharacterp, "characterp");
  DEFSYM (Qauto_fill_chars, "auto-fill-chars");

  /* Vchar_unify_table is not declared with DEFVAR_LISP in this
     function; it is staticpro'd explicitly and starts out as nil.  */
  staticpro (&Vchar_unify_table);
  Vchar_unify_table = Qnil;

  /* Pass the subr object of each primitive to defsubr.  */
  defsubr (&Smax_char);
  defsubr (&Scharacterp);
  defsubr (&Sunibyte_char_to_multibyte);
  defsubr (&Smultibyte_char_to_unibyte);
  defsubr (&Schar_bytes);
  defsubr (&Schar_width);
  defsubr (&Sstring_width);
  defsubr (&Schar_direction);
  defsubr (&Sstring);
  defsubr (&Sunibyte_string);
  defsubr (&Schar_resolve_modifiers);
  defsubr (&Sget_byte);

  /* Initial value: a vector of 16 elements, all nil.  */
  DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
               doc: /*
Vector recording all translation tables ever defined.
Each element is a pair (SYMBOL . TABLE) relating the table to the
symbol naming it.  The ID of a translation table is an index into this vector.  */);
  Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);

  /* Initial value: a char-table defaulting to nil in which only space
     and newline are set to t.  */
  DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
               doc: /*
A char-table for characters which invoke auto-filling.
Such characters have value t in this table.  */);
  Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
  CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
  CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);

  /* Initial value: every character has width 1, except that the range
     0x80..0x9F and the characters above MAX_5_BYTE_CHAR (up to
     MAX_CHAR) get width 4.  */
  DEFVAR_LISP ("char-width-table", &Vchar_width_table,
               doc: /*
A char-table for width (columns) of each character.  */);
  Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
  char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
  char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
                        make_number (4));

  /* Initial value: every character maps to 1.  */
  DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
               doc: /* A char-table for direction of each character.  */);
  Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));

  /* Initial value: nil by default, with t for the ranges 32..126 and
     160..MAX_5_BYTE_CHAR.  */
  DEFVAR_LISP ("printable-chars", &Vprintable_chars,
               doc: /* A char-table for each printable character.  */);
  Vprintable_chars = Fmake_char_table (Qnil, Qnil);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (32), make_number (126)), Qt);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (160),
                                make_number (MAX_5_BYTE_CHAR)), Qt);

  DEFVAR_LISP ("char-script-table", &Vchar_script_table,
               doc: /* Char table of script symbols.
It has one extra slot whose value is a list of script symbols.  */);

  /* Intern this now in case it isn't already done.
     Setting this variable twice is harmless.
     But don't staticpro it here--that is done in alloc.c.  */
  Qchar_table_extra_slots = intern ("char-table-extra-slots");
  DEFSYM (Qchar_script_table, "char-script-table");
  /* The `char-table-extra-slots' property (1 here) is put on the
     symbol before the table is created with that symbol as its
     purpose; this corresponds to the "one extra slot" in the doc
     string above.  */
  Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
  Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);

  DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
               doc: /* Alist of scripts vs the representative characters.
Each element is a cons (SCRIPT . CHARS), where SCRIPT is a script name symbol,
CHARS is a list or a vector of characters.
If it is a list, all characters in the list are necessary for supporting SCRIPT.
If it is a vector, one of the characters in the vector is necessary.
This variable is used to find a font for a specific script.  */);
  Vscript_representative_chars = Qnil;

  DEFVAR_LISP ("unicode-category-table", &Vunicode_category_table,
               doc: /* Char table of Unicode's "General Category".
All Unicode characters have one of the following values (symbol):
  Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
  Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
See The Unicode Standard for the meaning of those values.  */);
  /* The correct char-table is setup in characters.el.  */
  Vunicode_category_table = Qnil;
}
1186
1187 #endif /* emacs */
1188
1189 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1190    (do not change this comment) */