src/character.c

   1 /* Basic character support.
   2    Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009
   5      Free Software Foundation, Inc.
   6    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009
   7      National Institute of Advanced Industrial Science and Technology (AIST)
   8      Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
  25 /* At first, see the document in `character.h' to understand the code
  26    in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #include <stdio.h>
  33
  34 #ifdef emacs
  35
  36 #include <sys/types.h>
  37 #include "lisp.h"
  38 #include "character.h"
  39 #include "buffer.h"
  40 #include "charset.h"
  41 #include "composite.h"
  42 #include "disptab.h"
  43
  44 #else  /* not emacs */
  45
  46 #include "mulelib.h"
  47
  48 #endif /* emacs */
  49
/* The symbol `characterp'; set up with DEFSYM in syms_of_character.  */
Lisp_Object Qcharacterp;

/* Vector of all translation tables ever defined.
   The ID of a translation table is used to index this vector.  */
Lisp_Object Vtranslation_table_vector;

/* A char-table for characters which may invoke auto-filling.  */
Lisp_Object Vauto_fill_chars;

/* The symbol `auto-fill-chars'; set up with DEFSYM in
   syms_of_character.  */
Lisp_Object Qauto_fill_chars;

/* Char-table of information about which character to unify to which
   Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
Lisp_Object Vchar_unify_table;

/* A char-table.  An element is non-nil iff the corresponding
   character has a printable glyph.  */
Lisp_Object Vprintable_chars;

/* A char-table.  An element is the column width of the corresponding
   character.  */
Lisp_Object Vchar_width_table;

/* A char-table.  An element is a symbol indicating the direction
   property of the corresponding character.  */
Lisp_Object Vchar_direction_table;

/* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
unsigned char *_fetch_multibyte_char_p;

/* Char table of scripts.  */
Lisp_Object Vchar_script_table;

/* Alist of scripts vs representative characters.  */
Lisp_Object Vscript_representative_chars;

/* NOTE(review): presumably the symbol `char-script-table'; its
   initialization is not visible in this part of the file -- confirm.  */
static Lisp_Object Qchar_script_table;

/* NOTE(review): presumably a char-table giving the Unicode category
   of each character; it is not documented here -- confirm.  */
Lisp_Object Vunicode_category_table;

/* Mapping table from unibyte chars to multibyte chars.  */
int unibyte_to_multibyte_table[256];

/* Nth element is 1 iff unibyte char N can be mapped to a multibyte
   char.  */
char unibyte_has_multibyte_table[256];
  96
  97 \f
  98
  99 /* If character code C has modifier masks, reflect them to the
 100    character code if possible.  Return the resulting code.  */
 101
 102 int
 103 char_resolve_modifier_mask (c)
 104      int c;
 105 {
 106   /* A non-ASCII character can't reflect modifier bits to the code.  */
 107   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 108     return c;
 109
 110   /* For Meta, Shift, and Control modifiers, we need special care.  */
 111   if (c & CHAR_SHIFT)
 112     {
 113       /* Shift modifier is valid only with [A-Za-z].  */
 114       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 115         c &= ~CHAR_SHIFT;
 116       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 117         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 118       /* Shift modifier for control characters and SPC is ignored.  */
 119       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
 120         c &= ~CHAR_SHIFT;
 121     }
 122   if (c & CHAR_CTL)
 123     {
 124       /* Simulate the code in lread.c.  */
 125       /* Allow `\C- ' and `\C-?'.  */
 126       if ((c & 0377) == ' ')
 127         c &= ~0177 & ~ CHAR_CTL;
 128       else if ((c & 0377) == '?')
 129         c = 0177 | (c & ~0177 & ~CHAR_CTL);
 130       /* ASCII control chars are made from letters (both cases),
 131          as well as the non-letters within 0100...0137.  */
 132       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 133         c &= (037 | (~0177 & ~CHAR_CTL));
 134       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 135         c &= (037 | (~0177 & ~CHAR_CTL));
 136     }
 137   if (c & CHAR_META)
 138     {
 139       /* Move the meta bit to the right place for a string.  */
 140       c = (c & ~CHAR_META) | 0x80;
 141     }
 142
 143   return c;
 144 }
 145
 146
 147 /* Store multibyte form of character C at P.  If C has modifier bits,
 148    handle them appropriately.  */
 149
 150 int
 151 char_string (c, p)
 152      unsigned c;
 153      unsigned char *p;
 154 {
 155   int bytes;
 156
 157   if (c & CHAR_MODIFIER_MASK)
 158     {
 159       c = (unsigned) char_resolve_modifier_mask ((int) c);
 160       /* If C still has any modifier bits, just ignore it.  */
 161       c &= ~CHAR_MODIFIER_MASK;
 162     }
 163
 164   MAYBE_UNIFY_CHAR (c);
 165
 166   if (c <= MAX_3_BYTE_CHAR)
 167     {
 168       bytes = CHAR_STRING (c, p);
 169     }
 170   else if (c <= MAX_4_BYTE_CHAR)
 171     {
 172       p[0] = (0xF0 | (c >> 18));
 173       p[1] = (0x80 | ((c >> 12) & 0x3F));
 174       p[2] = (0x80 | ((c >> 6) & 0x3F));
 175       p[3] = (0x80 | (c & 0x3F));
 176       bytes = 4;
 177     }
 178   else if (c <= MAX_5_BYTE_CHAR)
 179     {
 180       p[0] = 0xF8;
 181       p[1] = (0x80 | ((c >> 18) & 0x0F));
 182       p[2] = (0x80 | ((c >> 12) & 0x3F));
 183       p[3] = (0x80 | ((c >> 6) & 0x3F));
 184       p[4] = (0x80 | (c & 0x3F));
 185       bytes = 5;
 186     }
 187   else if (c <= MAX_CHAR)
 188     {
 189       c = CHAR_TO_BYTE8 (c);
 190       bytes = BYTE8_STRING (c, p);
 191     }
 192   else
 193     error ("Invalid character: %d", c);
 194
 195   return bytes;
 196 }
 197
 198
 199 /* Return a character whose multibyte form is at P.  Set LEN is not
 200    NULL, it must be a pointer to integer.  In that case, set *LEN to
 201    the byte length of the multibyte form.  If ADVANCED is not NULL, is
 202    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 203    the ending address (i.e. the starting address of the next
 204    character) of the multibyte form.  */
 205
 206 int
 207 string_char (p, advanced, len)
 208      const unsigned char *p;
 209      const unsigned char **advanced;
 210      int *len;
 211 {
 212   int c;
 213   const unsigned char *saved_p = p;
 214
 215   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 216     {
 217       c = STRING_CHAR_ADVANCE (p);
 218     }
 219   else if (! (*p & 0x08))
 220     {
 221       c = ((((p)[0] & 0xF) << 18)
 222            | (((p)[1] & 0x3F) << 12)
 223            | (((p)[2] & 0x3F) << 6)
 224            | ((p)[3] & 0x3F));
 225       p += 4;
 226     }
 227   else
 228     {
 229       c = ((((p)[1] & 0x3F) << 18)
 230            | (((p)[2] & 0x3F) << 12)
 231            | (((p)[3] & 0x3F) << 6)
 232            | ((p)[4] & 0x3F));
 233       p += 5;
 234     }
 235
 236   MAYBE_UNIFY_CHAR (c);
 237
 238   if (len)
 239     *len = p - saved_p;
 240   if (advanced)
 241     *advanced = p;
 242   return c;
 243 }
 244
 245
 246 /* Translate character C by translation table TABLE.  If C is
 247    negative, translate a character specified by CHARSET and CODE.  If
 248    no translation is found in TABLE, return the untranslated
 249    character.  If TABLE is a list, elements are char tables.  In this
 250    case, translace C by all tables.  */
 251
 252 int
 253 translate_char (table, c)
 254      Lisp_Object table;
 255      int c;
 256 {
 257   if (CHAR_TABLE_P (table))
 258     {
 259       Lisp_Object ch;
 260
 261       ch = CHAR_TABLE_REF (table, c);
 262       if (CHARACTERP (ch))
 263         c = XINT (ch);
 264     }
 265   else
 266     {
 267       for (; CONSP (table); table = XCDR (table))
 268         c = translate_char (XCAR (table), c);
 269     }
 270   return c;
 271 }
 272
 273 /* Convert the multibyte character C to unibyte 8-bit character based
 274    on the current value of charset_unibyte.  If dimension of
 275    charset_unibyte is more than one, return (C & 0xFF).
 276
 277    The argument REV_TBL is now ignored.  It will be removed in the
 278    future.  */
 279
 280 int
 281 multibyte_char_to_unibyte (c, rev_tbl)
 282      int c;
 283      Lisp_Object rev_tbl;
 284 {
 285   struct charset *charset;
 286   unsigned c1;
 287
 288   if (CHAR_BYTE8_P (c))
 289     return CHAR_TO_BYTE8 (c);
 290   charset = CHARSET_FROM_ID (charset_unibyte);
 291   c1 = ENCODE_CHAR (charset, c);
 292   return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : c & 0xFF);
 293 }
 294
 295 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 296    by charset_unibyte.  */
 297
 298 int
 299 multibyte_char_to_unibyte_safe (c)
 300      int c;
 301 {
 302   struct charset *charset;
 303   unsigned c1;
 304
 305   if (CHAR_BYTE8_P (c))
 306     return CHAR_TO_BYTE8 (c);
 307   charset = CHARSET_FROM_ID (charset_unibyte);
 308   c1 = ENCODE_CHAR (charset, c);
 309   return ((c1 != CHARSET_INVALID_CODE (charset)) ? c1 : -1);
 310 }
 311
 312 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 313        doc: /* Return non-nil if OBJECT is a character.  */)
 314      (object, ignore)
 315      Lisp_Object object, ignore;
 316 {
 317   return (CHARACTERP (object) ? Qt : Qnil);
 318 }
 319
 320 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 321        doc: /* Return the character of the maximum code.  */)
 322      ()
 323 {
 324   return make_number (MAX_CHAR);
 325 }
 326
 327 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 328        Sunibyte_char_to_multibyte, 1, 1, 0,
 329        doc: /* Convert the byte CH to multibyte character.  */)
 330      (ch)
 331      Lisp_Object ch;
 332 {
 333   int c;
 334   struct charset *charset;
 335
 336   CHECK_CHARACTER (ch);
 337   c = XFASTINT (ch);
 338   if (c >= 0400)
 339     error ("Invalid unibyte character: %d", c);
 340   charset = CHARSET_FROM_ID (charset_unibyte);
 341   c = DECODE_CHAR (charset, c);
 342   if (c < 0)
 343     c = BYTE8_TO_CHAR (XFASTINT (ch));
 344   return make_number (c);
 345 }
 346
 347 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 348        Smultibyte_char_to_unibyte, 1, 1, 0,
 349        doc: /* Convert the multibyte character CH to a byte.
 350 If the multibyte character does not represent a byte, return -1.  */)
 351      (ch)
 352      Lisp_Object ch;
 353 {
 354   int cm;
 355
 356   CHECK_CHARACTER (ch);
 357   cm = XFASTINT (ch);
 358   if (cm < 256)
 359     /* Can't distinguish a byte read from a unibyte buffer from
 360        a latin1 char, so let's let it slide.  */
 361     return ch;
 362   else
 363     {
 364       int cu = CHAR_TO_BYTE_SAFE (cm);
 365       return make_number (cu);
 366     }
 367 }
 368
 369 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
 370        doc: /* Return 1 regardless of the argument CHAR.
 371 This is now an obsolete function.  We keep it just for backward compatibility.
 372 usage: (char-bytes CHAR)  */)
 373      (ch)
 374      Lisp_Object ch;
 375 {
 376   CHECK_CHARACTER (ch);
 377   return make_number (1);
 378 }
 379
 380 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 381        doc: /* Return width of CHAR when displayed in the current buffer.
 382 The width is measured by how many columns it occupies on the screen.
 383 Tab is taken to occupy `tab-width' columns.
 384 usage: (char-width CHAR)  */)
 385      (ch)
 386        Lisp_Object ch;
 387 {
 388   Lisp_Object disp;
 389   int c, width;
 390   struct Lisp_Char_Table *dp = buffer_display_table ();
 391
 392   CHECK_CHARACTER (ch);
 393   c = XINT (ch);
 394
 395   /* Get the way the display table would display it.  */
 396   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
 397
 398   if (VECTORP (disp))
 399     width = ASIZE (disp);
 400   else
 401     width = CHAR_WIDTH (c);
 402
 403   return make_number (width);
 404 }
 405
 406 /* Return width of string STR of length LEN when displayed in the
 407    current buffer.  The width is measured by how many columns it
 408    occupies on the screen.  If PRECISION > 0, return the width of
 409    longest substring that doesn't exceed PRECISION, and set number of
 410    characters and bytes of the substring in *NCHARS and *NBYTES
 411    respectively.  */
 412
 413 int
 414 c_string_width (const unsigned char *str, int len, int precision, int *nchars, int *nbytes)
 415 {
 416   int i = 0, i_byte = 0;
 417   int width = 0;
 418   struct Lisp_Char_Table *dp = buffer_display_table ();
 419
 420   while (i_byte < len)
 421     {
 422       int bytes, thiswidth;
 423       Lisp_Object val;
 424       int c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 425
 426       if (dp)
 427         {
 428           val = DISP_CHAR_VECTOR (dp, c);
 429           if (VECTORP (val))
 430             thiswidth = XVECTOR (val)->size;
 431           else
 432             thiswidth = CHAR_WIDTH (c);
 433         }
 434       else
 435         {
 436           thiswidth = CHAR_WIDTH (c);
 437         }
 438
 439       if (precision > 0
 440           && (width + thiswidth > precision))
 441         {
 442           *nchars = i;
 443           *nbytes = i_byte;
 444           return width;
 445         }
 446       i++;
 447       i_byte += bytes;
 448       width += thiswidth;
 449   }
 450
 451   if (precision > 0)
 452     {
 453       *nchars = i;
 454       *nbytes = i_byte;
 455     }
 456
 457   return width;
 458 }
 459
 460 /* Return width of string STR of length LEN when displayed in the
 461    current buffer.  The width is measured by how many columns it
 462    occupies on the screen.  */
 463
 464 int
 465 strwidth (str, len)
 466      unsigned char *str;
 467      int len;
 468 {
 469   return c_string_width (str, len, -1, NULL, NULL);
 470 }
 471
 472 /* Return width of Lisp string STRING when displayed in the current
 473    buffer.  The width is measured by how many columns it occupies on
 474    the screen while paying attention to compositions.  If PRECISION >
 475    0, return the width of longest substring that doesn't exceed
 476    PRECISION, and set number of characters and bytes of the substring
 477    in *NCHARS and *NBYTES respectively.  */
 478
 479 int
 480 lisp_string_width (string, precision, nchars, nbytes)
 481      Lisp_Object string;
 482      int precision, *nchars, *nbytes;
 483 {
 484   int len = SCHARS (string);
 485   /* This set multibyte to 0 even if STRING is multibyte when it
 486      contains only ascii and eight-bit-graphic, but that's
 487      intentional.  */
 488   int multibyte = len < SBYTES (string);
 489   unsigned char *str = SDATA (string);
 490   int i = 0, i_byte = 0;
 491   int width = 0;
 492   struct Lisp_Char_Table *dp = buffer_display_table ();
 493
 494   while (i < len)
 495     {
 496       int chars, bytes, thiswidth;
 497       Lisp_Object val;
 498       int cmp_id;
 499       EMACS_INT ignore, end;
 500
 501       if (find_composition (i, -1, &ignore, &end, &val, string)
 502           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 503               >= 0))
 504         {
 505           thiswidth = composition_table[cmp_id]->width;
 506           chars = end - i;
 507           bytes = string_char_to_byte (string, end) - i_byte;
 508         }
 509       else
 510         {
 511           int c;
 512
 513           if (multibyte)
 514             c = STRING_CHAR_AND_LENGTH (str + i_byte, len - i_byte, bytes);
 515           else
 516             c = str[i_byte], bytes = 1;
 517           chars = 1;
 518           if (dp)
 519             {
 520               val = DISP_CHAR_VECTOR (dp, c);
 521               if (VECTORP (val))
 522                 thiswidth = XVECTOR (val)->size;
 523               else
 524                 thiswidth = CHAR_WIDTH (c);
 525             }
 526           else
 527             {
 528               thiswidth = CHAR_WIDTH (c);
 529             }
 530         }
 531
 532       if (precision > 0
 533           && (width + thiswidth > precision))
 534         {
 535           *nchars = i;
 536           *nbytes = i_byte;
 537           return width;
 538         }
 539       i += chars;
 540       i_byte += bytes;
 541       width += thiswidth;
 542   }
 543
 544   if (precision > 0)
 545     {
 546       *nchars = i;
 547       *nbytes = i_byte;
 548     }
 549
 550   return width;
 551 }
 552
 553 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 554        doc: /* Return width of STRING when displayed in the current buffer.
 555 Width is measured by how many columns it occupies on the screen.
 556 When calculating width of a multibyte character in STRING,
 557 only the base leading-code is considered; the validity of
 558 the following bytes is not checked.  Tabs in STRING are always
 559 taken to occupy `tab-width' columns.
 560 usage: (string-width STRING)  */)
 561      (str)
 562      Lisp_Object str;
 563 {
 564   Lisp_Object val;
 565
 566   CHECK_STRING (str);
 567   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 568   return val;
 569 }
 570
 571 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
 572        doc: /* Return the direction of CHAR.
 573 The returned value is 0 for left-to-right and 1 for right-to-left.
 574 usage: (char-direction CHAR)  */)
 575      (ch)
 576      Lisp_Object ch;
 577 {
 578   int c;
 579
 580   CHECK_CHARACTER (ch);
 581   c = XINT (ch);
 582   return CHAR_TABLE_REF (Vchar_direction_table, c);
 583 }
 584
 585 /* Return the number of characters in the NBYTES bytes at PTR.
 586    This works by looking at the contents and checking for multibyte
 587    sequences while assuming that there's no invalid sequence.
 588    However, if the current buffer has enable-multibyte-characters =
 589    nil, we treat each byte as a character.  */
 590
 591 EMACS_INT
 592 chars_in_text (ptr, nbytes)
 593      const unsigned char *ptr;
 594      EMACS_INT nbytes;
 595 {
 596   /* current_buffer is null at early stages of Emacs initialization.  */
 597   if (current_buffer == 0
 598       || NILP (current_buffer->enable_multibyte_characters))
 599     return nbytes;
 600
 601   return multibyte_chars_in_text (ptr, nbytes);
 602 }
 603
 604 /* Return the number of characters in the NBYTES bytes at PTR.
 605    This works by looking at the contents and checking for multibyte
 606    sequences while assuming that there's no invalid sequence.  It
 607    ignores enable-multibyte-characters.  */
 608
 609 EMACS_INT
 610 multibyte_chars_in_text (ptr, nbytes)
 611      const unsigned char *ptr;
 612      EMACS_INT nbytes;
 613 {
 614   const unsigned char *endp = ptr + nbytes;
 615   int chars = 0;
 616
 617   while (ptr < endp)
 618     {
 619       int len = MULTIBYTE_LENGTH (ptr, endp);
 620
 621       if (len == 0)
 622         abort ();
 623       ptr += len;
 624       chars++;
 625     }
 626
 627   return chars;
 628 }
 629
 630 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 631    characters and bytes in it, and store them in *NCHARS and *NBYTES
 632    respectively.  On counting bytes, pay attention to that 8-bit
 633    characters not constructing a valid multibyte sequence are
 634    represented by 2-byte in a multibyte text.  */
 635
 636 void
 637 parse_str_as_multibyte (str, len, nchars, nbytes)
 638      const unsigned char *str;
 639      int len, *nchars, *nbytes;
 640 {
 641   const unsigned char *endp = str + len;
 642   int n, chars = 0, bytes = 0;
 643
 644   if (len >= MAX_MULTIBYTE_LENGTH)
 645     {
 646       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 647       while (str < adjusted_endp)
 648         {
 649           if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 650             str += n, bytes += n;
 651           else
 652             str++, bytes += 2;
 653           chars++;
 654         }
 655     }
 656   while (str < endp)
 657     {
 658       if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
 659         str += n, bytes += n;
 660       else
 661         str++, bytes += 2;
 662       chars++;
 663     }
 664
 665   *nchars = chars;
 666   *nbytes = bytes;
 667   return;
 668 }
 669
 670 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 671    It actually converts only such 8-bit characters that don't contruct
 672    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 673    NCHARS is nonzero, set *NCHARS to the number of characters in the
 674    text.  It is assured that we can use LEN bytes at STR as a work
 675    area and that is enough.  Return the number of bytes of the
 676    resulting text.  */
 677
 678 int
 679 str_as_multibyte (str, len, nbytes, nchars)
 680      unsigned char *str;
 681      int len, nbytes, *nchars;
 682 {
 683   unsigned char *p = str, *endp = str + nbytes;
 684   unsigned char *to;
 685   int chars = 0;
 686   int n;
 687
 688   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 689     {
 690       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 691       while (p < adjusted_endp
 692              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 693         p += n, chars++;
 694     }
 695   while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 696     p += n, chars++;
 697   if (nchars)
 698     *nchars = chars;
 699   if (p == endp)
 700     return nbytes;
 701
 702   to = p;
 703   nbytes = endp - p;
 704   endp = str + len;
 705   safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
 706   p = endp - nbytes;
 707
 708   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 709     {
 710       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 711       while (p < adjusted_endp)
 712         {
 713           if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 714             {
 715               while (n--)
 716                 *to++ = *p++;
 717             }
 718           else
 719             {
 720               int c = *p++;
 721               c = BYTE8_TO_CHAR (c);
 722               to += CHAR_STRING (c, to);
 723             }
 724         }
 725       chars++;
 726     }
 727   while (p < endp)
 728     {
 729       if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 730         {
 731           while (n--)
 732             *to++ = *p++;
 733         }
 734       else
 735         {
 736           int c = *p++;
 737           c = BYTE8_TO_CHAR (c);
 738           to += CHAR_STRING (c, to);
 739         }
 740       chars++;
 741     }
 742   if (nchars)
 743     *nchars = chars;
 744   return (to - str);
 745 }
 746
 747 /* Parse unibyte string at STR of LEN bytes, and return the number of
 748    bytes it may ocupy when converted to multibyte string by
 749    `str_to_multibyte'.  */
 750
 751 int
 752 parse_str_to_multibyte (str, len)
 753      unsigned char *str;
 754      int len;
 755 {
 756   unsigned char *endp = str + len;
 757   int bytes;
 758
 759   for (bytes = 0; str < endp; str++)
 760     bytes += (*str < 0x80) ? 1 : 2;
 761   return bytes;
 762 }
 763
 764
 765 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
 766    that contains the same single-byte characters.  It actually
 767    converts all 8-bit characters to multibyte forms.  It is assured
 768    that we can use LEN bytes at STR as a work area and that is
 769    enough.  */
 770
 771 int
 772 str_to_multibyte (str, len, bytes)
 773      unsigned char *str;
 774      int len, bytes;
 775 {
 776   unsigned char *p = str, *endp = str + bytes;
 777   unsigned char *to;
 778
 779   while (p < endp && *p < 0x80) p++;
 780   if (p == endp)
 781     return bytes;
 782   to = p;
 783   bytes = endp - p;
 784   endp = str + len;
 785   safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
 786   p = endp - bytes;
 787   while (p < endp)
 788     {
 789       int c = *p++;
 790
 791       if (c >= 0x80)
 792         c = BYTE8_TO_CHAR (c);
 793       to += CHAR_STRING (c, to);
 794     }
 795   return (to - str);
 796 }
 797
 798 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 799    actually converts characters in the range 0x80..0xFF to
 800    unibyte.  */
 801
 802 int
 803 str_as_unibyte (str, bytes)
 804      unsigned char *str;
 805      int bytes;
 806 {
 807   const unsigned char *p = str, *endp = str + bytes;
 808   unsigned char *to;
 809   int c, len;
 810
 811   while (p < endp)
 812     {
 813       c = *p;
 814       len = BYTES_BY_CHAR_HEAD (c);
 815       if (CHAR_BYTE8_HEAD_P (c))
 816         break;
 817       p += len;
 818     }
 819   to = str + (p - str);
 820   while (p < endp)
 821     {
 822       c = *p;
 823       len = BYTES_BY_CHAR_HEAD (c);
 824       if (CHAR_BYTE8_HEAD_P (c))
 825         {
 826           c = STRING_CHAR_ADVANCE (p);
 827           *to++ = CHAR_TO_BYTE8 (c);
 828         }
 829       else
 830         {
 831           while (len--) *to++ = *p++;
 832         }
 833     }
 834   return (to - str);
 835 }
 836
 837 /* Convert eight-bit chars in SRC (in multibyte form) to the
 838    corresponding byte and store in DST.  CHARS is the number of
 839    characters in SRC.  The value is the number of bytes stored in DST.
 840    Usually, the value is the same as CHARS, but is less than it if SRC
 841    contains a non-ASCII, non-eight-bit characater.  If ACCEPT_LATIN_1
 842    is nonzero, a Latin-1 character is accepted and converted to a byte
 843    of that character code.
 844    Note: Currently the arg ACCEPT_LATIN_1 is not used.  */
 845
 846 EMACS_INT
 847 str_to_unibyte (src, dst, chars, accept_latin_1)
 848      const unsigned char *src;
 849      unsigned char *dst;
 850      EMACS_INT chars;
 851      int accept_latin_1;
 852 {
 853   EMACS_INT i;
 854
 855   for (i = 0; i < chars; i++)
 856     {
 857       int c = STRING_CHAR_ADVANCE (src);
 858
 859       if (CHAR_BYTE8_P (c))
 860         c = CHAR_TO_BYTE8 (c);
 861       else if (! ASCII_CHAR_P (c)
 862                && (! accept_latin_1 || c >= 0x100))
 863         return i;
 864       *dst++ = c;
 865     }
 866   return i;
 867 }
 868
 869
 870 int
 871 string_count_byte8 (string)
 872      Lisp_Object string;
 873 {
 874   int multibyte = STRING_MULTIBYTE (string);
 875   int nbytes = SBYTES (string);
 876   unsigned char *p = SDATA (string);
 877   unsigned char *pend = p + nbytes;
 878   int count = 0;
 879   int c, len;
 880
 881   if (multibyte)
 882     while (p < pend)
 883       {
 884         c = *p;
 885         len = BYTES_BY_CHAR_HEAD (c);
 886
 887         if (CHAR_BYTE8_HEAD_P (c))
 888           count++;
 889         p += len;
 890       }
 891   else
 892     while (p < pend)
 893       {
 894         if (*p++ >= 0x80)
 895           count++;
 896       }
 897   return count;
 898 }
 899
 900
 901 Lisp_Object
 902 string_escape_byte8 (string)
 903      Lisp_Object string;
 904 {
 905   int nchars = SCHARS (string);
 906   int nbytes = SBYTES (string);
 907   int multibyte = STRING_MULTIBYTE (string);
 908   int byte8_count;
 909   const unsigned char *src, *src_end;
 910   unsigned char *dst;
 911   Lisp_Object val;
 912   int c, len;
 913
 914   if (multibyte && nchars == nbytes)
 915     return string;
 916
 917   byte8_count = string_count_byte8 (string);
 918
 919   if (byte8_count == 0)
 920     return string;
 921
 922   if (multibyte)
 923     /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 924     val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 925                                         nbytes + byte8_count * 2);
 926   else
 927     /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 928     val = make_uninit_string (nbytes + byte8_count * 3);
 929
 930   src = SDATA (string);
 931   src_end = src + nbytes;
 932   dst = SDATA (val);
 933   if (multibyte)
 934     while (src < src_end)
 935       {
 936         c = *src;
 937         len = BYTES_BY_CHAR_HEAD (c);
 938
 939         if (CHAR_BYTE8_HEAD_P (c))
 940           {
 941             c = STRING_CHAR_ADVANCE (src);
 942             c = CHAR_TO_BYTE8 (c);
 943             sprintf ((char *) dst, "\\%03o", c);
 944             dst += 4;
 945           }
 946         else
 947           while (len--) *dst++ = *src++;
 948       }
 949   else
 950     while (src < src_end)
 951       {
 952         c = *src++;
 953         if (c >= 0x80)
 954           {
 955             sprintf ((char *) dst, "\\%03o", c);
 956             dst += 4;
 957           }
 958         else
 959           *dst++ = c;
 960       }
 961   return val;
 962 }
 963
 964 \f
 965 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 966        doc: /*
 967 Concatenate all the argument characters and make the result a string.
 968 usage: (string &rest CHARACTERS)  */)
 969      (n, args)
 970      int n;
 971      Lisp_Object *args;
 972 {
 973   int i;
 974   unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
 975   unsigned char *p = buf;
 976   int c;
 977
 978   for (i = 0; i < n; i++)
 979     {
 980       CHECK_CHARACTER (args[i]);
 981       c = XINT (args[i]);
 982       p += CHAR_STRING (c, p);
 983     }
 984
 985   return make_string_from_bytes ((char *) buf, n, p - buf);
 986 }
 987
 988 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 989        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 990 usage: (unibyte-string &rest BYTES)  */)
 991      (n, args)
 992      int n;
 993      Lisp_Object *args;
 994 {
 995   int i;
 996   unsigned char *buf = (unsigned char *) alloca (n);
 997   unsigned char *p = buf;
 998   unsigned c;
 999
1000   for (i = 0; i < n; i++)
1001     {
1002       CHECK_NATNUM (args[i]);
1003       c = XFASTINT (args[i]);
1004       if (c >= 256)
1005         args_out_of_range_3 (args[i], make_number (0), make_number (255));
1006       *p++ = c;
1007     }
1008
1009   return make_string_from_bytes ((char *) buf, n, p - buf);
1010 }
1011
1012 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
1013        Schar_resolve_modifiers, 1, 1, 0,
1014        doc: /* Resolve modifiers in the character CHAR.
1015 The value is a character with modifiers resolved into the character
1016 code.  Unresolved modifiers are kept in the value.
1017 usage: (char-resolve-modifiers CHAR)  */)
1018      (character)
1019      Lisp_Object character;
1020 {
1021   int c;
1022
1023   CHECK_NUMBER (character);
1024   c = XINT (character);
1025   return make_number (char_resolve_modifier_mask (c));
1026 }
1027
1028 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
1029        doc: /* Return a byte value of a character at point.
1030 Optional 1st arg POSITION, if non-nil, is a position of a character to get
1031 a byte value.
1032 Optional 2nd arg STRING, if non-nil, is a string of which first
1033 character is a target to get a byte value.  In this case, POSITION, if
1034 non-nil, is an index of a target character in the string.
1035
1036 If the current buffer (or STRING) is multibyte, and the target
1037 character is not ASCII nor 8-bit character, an error is signalled.  */)
1038      (position, string)
1039      Lisp_Object position, string;
1040 {
1041   int c;
1042   EMACS_INT pos;
1043   unsigned char *p;
1044
1045   if (NILP (string))
1046     {
1047       if (NILP (position))
1048         {
1049           p = PT_ADDR;
1050         }
1051       else
1052         {
1053           CHECK_NUMBER_COERCE_MARKER (position);
1054           if (XINT (position) < BEGV || XINT (position) >= ZV)
1055             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
1056           pos = XFASTINT (position);
1057           p = CHAR_POS_ADDR (pos);
1058         }
1059       if (NILP (current_buffer->enable_multibyte_characters))
1060         return make_number (*p);
1061     }
1062   else
1063     {
1064       CHECK_STRING (string);
1065       if (NILP (position))
1066         {
1067           p = SDATA (string);
1068         }
1069       else
1070         {
1071           CHECK_NATNUM (position);
1072           if (XINT (position) >= SCHARS (string))
1073             args_out_of_range (string, position);
1074           pos = XFASTINT (position);
1075           p = SDATA (string) + string_char_to_byte (string, pos);
1076         }
1077       if (! STRING_MULTIBYTE (string))
1078         return make_number (*p);
1079     }
1080   c = STRING_CHAR (p, 0);
1081   if (CHAR_BYTE8_P (c))
1082     c = CHAR_TO_BYTE8 (c);
1083   else if (! ASCII_CHAR_P (c))
1084     error ("Not an ASCII nor an 8-bit character: %d", c);
1085   return make_number (c);
1086 }
1087
1088
/* One-time initialization entry point for this file.  The body is
   empty: nothing here currently needs such setup.  */
void
init_character_once ()
{
}
1093
1094 #ifdef emacs
1095
/* Define the symbols, primitives and Lisp variables provided by this
   file, and give the variables their initial values.  */
void
syms_of_character ()
{
  DEFSYM (Qcharacterp, "characterp");
  DEFSYM (Qauto_fill_chars, "auto-fill-chars");

  /* Vchar_unify_table is registered with staticpro rather than being
     exposed through DEFVAR_LISP; it starts out as nil.  */
  staticpro (&Vchar_unify_table);
  Vchar_unify_table = Qnil;

  /* Register the subr objects for the primitives of this file.  */
  defsubr (&Smax_char);
  defsubr (&Scharacterp);
  defsubr (&Sunibyte_char_to_multibyte);
  defsubr (&Smultibyte_char_to_unibyte);
  defsubr (&Schar_bytes);
  defsubr (&Schar_width);
  defsubr (&Sstring_width);
  defsubr (&Schar_direction);
  defsubr (&Sstring);
  defsubr (&Sunibyte_string);
  defsubr (&Schar_resolve_modifiers);
  defsubr (&Sget_byte);

  /* The vector initially has 16 slots, all nil.  */
  DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
               doc: /*
Vector recording all translation tables ever defined.
Each element is a pair (SYMBOL . TABLE) relating the table to the
symbol naming it.  The ID of a translation table is an index into this vector.  */);
  Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);

  /* Initially only space and newline are marked with t.  */
  DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
               doc: /*
A char-table for characters which invoke auto-filling.
Such characters have value t in this table.  */);
  Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
  CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
  CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);

  /* The table is created with 1 as its initial value; the ranges
     0x80..0x9F and MAX_5_BYTE_CHAR + 1..MAX_CHAR are then set to 4.  */
  DEFVAR_LISP ("char-width-table", &Vchar_width_table,
               doc: /*
A char-table for width (columns) of each character.  */);
  Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
  char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
  char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
                        make_number (4));

  /* The table is created with 1 as its initial value.  */
  DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
               doc: /* A char-table for direction of each character.  */);
  Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));

  /* Initially the ranges 32..126 and 160..MAX_5_BYTE_CHAR are set to
     t; the table is created with nil as its initial value.  */
  DEFVAR_LISP ("printable-chars", &Vprintable_chars,
               doc: /* A char-table for each printable character.  */);
  Vprintable_chars = Fmake_char_table (Qnil, Qnil);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (32), make_number (126)), Qt);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (160),
                                make_number (MAX_5_BYTE_CHAR)), Qt);

  DEFVAR_LISP ("char-script-table", &Vchar_script_table,
               doc: /* Char table of script symbols.
It has one extra slot whose value is a list of script symbols.  */);

  /* Intern this now in case it isn't already done.
     Setting this variable twice is harmless.
     But don't staticpro it here--that is done in alloc.c.  */
  Qchar_table_extra_slots = intern ("char-table-extra-slots");
  DEFSYM (Qchar_script_table, "char-script-table");
  /* The `char-table-extra-slots' property (value 1) is put on the
     symbol before the char-table of that purpose is created.  */
  Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
  Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);

  DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
               doc: /* Alist of scripts vs the representative characters.
Each element is a cons (SCRIPT . CHARS).
SCRIPT is a symbol representing a script or a subgroup of a script.
CHARS is a list or a vector of characters.
If it is a list, all characters in the list are necessary for supporting SCRIPT.
If it is a vector, one of the characters in the vector is necessary.
This variable is used to find a font for a specific script.  */);
  Vscript_representative_chars = Qnil;

  DEFVAR_LISP ("unicode-category-table", &Vunicode_category_table,
               doc: /* Char table of Unicode's "General Category".
All Unicode characters have one of the following values (symbol):
  Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
  Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
See The Unicode Standard for the meaning of those values.  */);
  /* The correct char-table is setup in characters.el.  */
  Vunicode_category_table = Qnil;
}
1185
1186 #endif /* emacs */
1187
1188 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1189    (do not change this comment) */