src/character.c

   1 /* Basic character support.
   2    Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   3      Licensed to the Free Software Foundation.
   4    Copyright (C) 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
   5      Free Software Foundation, Inc.
   6    Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010
   7      National Institute of Advanced Industrial Science and Technology (AIST)
   8      Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
  25 /* At first, see the document in `character.h' to understand the code
  26    in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #include <stdio.h>
  33
  34 #ifdef emacs
  35
  36 #include <sys/types.h>
  37 #include <setjmp.h>
  38 #include "lisp.h"
  39 #include "character.h"
  40 #include "buffer.h"
  41 #include "charset.h"
  42 #include "composite.h"
  43 #include "disptab.h"
  44
  45 #else  /* not emacs */
  46
  47 #include "mulelib.h"
  48
  49 #endif /* emacs */
  50
  51 Lisp_Object Qcharacterp;
  52
  53 /* Vector of translation table ever defined.
  54    ID of a translation table is used to index this vector.  */
  55 Lisp_Object Vtranslation_table_vector;
  56
  57 /* A char-table for characters which may invoke auto-filling.  */
  58 Lisp_Object Vauto_fill_chars;
  59
  60 Lisp_Object Qauto_fill_chars;
  61
  62 /* Char-table of information about which character to unify to which
  63    Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
  64 Lisp_Object Vchar_unify_table;
  65
  66 /* A char-table.  An element is non-nil iff the corresponding
  67    character has a printable glyph.  */
  68 Lisp_Object Vprintable_chars;
  69
  70 /* A char-table.  An elemnent is a column-width of the corresponding
  71    character.  */
  72 Lisp_Object Vchar_width_table;
  73
  74 /* A char-table.  An element is a symbol indicating the direction
  75    property of corresponding character.  */
  76 Lisp_Object Vchar_direction_table;
  77
  78 /* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
  79 unsigned char *_fetch_multibyte_char_p;
  80
  81 /* Char table of scripts.  */
  82 Lisp_Object Vchar_script_table;
  83
  84 /* Alist of scripts vs representative characters.  */
  85 Lisp_Object Vscript_representative_chars;
  86
  87 static Lisp_Object Qchar_script_table;
  88
  89 Lisp_Object Vunicode_category_table;
  90 \f
  91
  92 /* If character code C has modifier masks, reflect them to the
  93    character code if possible.  Return the resulting code.  */
  94
  95 int
  96 char_resolve_modifier_mask (c)
  97      int c;
  98 {
  99   /* A non-ASCII character can't reflect modifier bits to the code.  */
 100   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
 101     return c;
 102
 103   /* For Meta, Shift, and Control modifiers, we need special care.  */
 104   if (c & CHAR_SHIFT)
 105     {
 106       /* Shift modifier is valid only with [A-Za-z].  */
 107       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
 108         c &= ~CHAR_SHIFT;
 109       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
 110         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
 111       /* Shift modifier for control characters and SPC is ignored.  */
 112       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
 113         c &= ~CHAR_SHIFT;
 114     }
 115   if (c & CHAR_CTL)
 116     {
 117       /* Simulate the code in lread.c.  */
 118       /* Allow `\C- ' and `\C-?'.  */
 119       if ((c & 0377) == ' ')
 120         c &= ~0177 & ~ CHAR_CTL;
 121       else if ((c & 0377) == '?')
 122         c = 0177 | (c & ~0177 & ~CHAR_CTL);
 123       /* ASCII control chars are made from letters (both cases),
 124          as well as the non-letters within 0100...0137.  */
 125       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 126         c &= (037 | (~0177 & ~CHAR_CTL));
 127       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 128         c &= (037 | (~0177 & ~CHAR_CTL));
 129     }
 130 #if 0   /* This is outside the scope of this function.  (bug#4751)  */
 131   if (c & CHAR_META)
 132     {
 133       /* Move the meta bit to the right place for a string.  */
 134       c = (c & ~CHAR_META) | 0x80;
 135     }
 136 #endif
 137
 138   return c;
 139 }
 140
 141
 142 /* Store multibyte form of character C at P.  If C has modifier bits,
 143    handle them appropriately.  */
 144
 145 int
 146 char_string (c, p)
 147      unsigned c;
 148      unsigned char *p;
 149 {
 150   int bytes;
 151
 152   if (c & CHAR_MODIFIER_MASK)
 153     {
 154       c = (unsigned) char_resolve_modifier_mask ((int) c);
 155       /* If C still has any modifier bits, just ignore it.  */
 156       c &= ~CHAR_MODIFIER_MASK;
 157     }
 158
 159   MAYBE_UNIFY_CHAR (c);
 160
 161   if (c <= MAX_3_BYTE_CHAR)
 162     {
 163       bytes = CHAR_STRING (c, p);
 164     }
 165   else if (c <= MAX_4_BYTE_CHAR)
 166     {
 167       p[0] = (0xF0 | (c >> 18));
 168       p[1] = (0x80 | ((c >> 12) & 0x3F));
 169       p[2] = (0x80 | ((c >> 6) & 0x3F));
 170       p[3] = (0x80 | (c & 0x3F));
 171       bytes = 4;
 172     }
 173   else if (c <= MAX_5_BYTE_CHAR)
 174     {
 175       p[0] = 0xF8;
 176       p[1] = (0x80 | ((c >> 18) & 0x0F));
 177       p[2] = (0x80 | ((c >> 12) & 0x3F));
 178       p[3] = (0x80 | ((c >> 6) & 0x3F));
 179       p[4] = (0x80 | (c & 0x3F));
 180       bytes = 5;
 181     }
 182   else if (c <= MAX_CHAR)
 183     {
 184       c = CHAR_TO_BYTE8 (c);
 185       bytes = BYTE8_STRING (c, p);
 186     }
 187   else
 188     error ("Invalid character: %d", c);
 189
 190   return bytes;
 191 }
 192
 193
 194 /* Return a character whose multibyte form is at P.  Set LEN is not
 195    NULL, it must be a pointer to integer.  In that case, set *LEN to
 196    the byte length of the multibyte form.  If ADVANCED is not NULL, is
 197    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 198    the ending address (i.e. the starting address of the next
 199    character) of the multibyte form.  */
 200
 201 int
 202 string_char (p, advanced, len)
 203      const unsigned char *p;
 204      const unsigned char **advanced;
 205      int *len;
 206 {
 207   int c;
 208   const unsigned char *saved_p = p;
 209
 210   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 211     {
 212       c = STRING_CHAR_ADVANCE (p);
 213     }
 214   else if (! (*p & 0x08))
 215     {
 216       c = ((((p)[0] & 0xF) << 18)
 217            | (((p)[1] & 0x3F) << 12)
 218            | (((p)[2] & 0x3F) << 6)
 219            | ((p)[3] & 0x3F));
 220       p += 4;
 221     }
 222   else
 223     {
 224       c = ((((p)[1] & 0x3F) << 18)
 225            | (((p)[2] & 0x3F) << 12)
 226            | (((p)[3] & 0x3F) << 6)
 227            | ((p)[4] & 0x3F));
 228       p += 5;
 229     }
 230
 231   MAYBE_UNIFY_CHAR (c);
 232
 233   if (len)
 234     *len = p - saved_p;
 235   if (advanced)
 236     *advanced = p;
 237   return c;
 238 }
 239
 240
 241 /* Translate character C by translation table TABLE.  If C is
 242    negative, translate a character specified by CHARSET and CODE.  If
 243    no translation is found in TABLE, return the untranslated
 244    character.  If TABLE is a list, elements are char tables.  In this
 245    case, translace C by all tables.  */
 246
 247 int
 248 translate_char (table, c)
 249      Lisp_Object table;
 250      int c;
 251 {
 252   if (CHAR_TABLE_P (table))
 253     {
 254       Lisp_Object ch;
 255
 256       ch = CHAR_TABLE_REF (table, c);
 257       if (CHARACTERP (ch))
 258         c = XINT (ch);
 259     }
 260   else
 261     {
 262       for (; CONSP (table); table = XCDR (table))
 263         c = translate_char (XCAR (table), c);
 264     }
 265   return c;
 266 }
 267
 268 /* Convert ASCII or 8-bit character C to unibyte.  If C is none of
 269    them, return (C & 0xFF).
 270
 271    The argument REV_TBL is now ignored.  It will be removed in the
 272    future.  */
 273
 274 int
 275 multibyte_char_to_unibyte (c, rev_tbl)
 276      int c;
 277      Lisp_Object rev_tbl;
 278 {
 279   if (c < 0x80)
 280     return c;
 281   if (CHAR_BYTE8_P (c))
 282     return CHAR_TO_BYTE8 (c);
 283   return (c & 0xFF);
 284 }
 285
 286 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 287    by charset_unibyte.  */
 288
 289 int
 290 multibyte_char_to_unibyte_safe (c)
 291      int c;
 292 {
 293   if (c < 0x80)
 294     return c;
 295   if (CHAR_BYTE8_P (c))
 296     return CHAR_TO_BYTE8 (c);
 297   return -1;
 298 }
 299
 300 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 301        doc: /* Return non-nil if OBJECT is a character.  */)
 302      (object, ignore)
 303      Lisp_Object object, ignore;
 304 {
 305   return (CHARACTERP (object) ? Qt : Qnil);
 306 }
 307
 308 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 309        doc: /* Return the character of the maximum code.  */)
 310      ()
 311 {
 312   return make_number (MAX_CHAR);
 313 }
 314
 315 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 316        Sunibyte_char_to_multibyte, 1, 1, 0,
 317        doc: /* Convert the byte CH to multibyte character.  */)
 318      (ch)
 319      Lisp_Object ch;
 320 {
 321   int c;
 322
 323   CHECK_CHARACTER (ch);
 324   c = XFASTINT (ch);
 325   if (c >= 0x100)
 326     error ("Not a unibyte character: %d", c);
 327   MAKE_CHAR_MULTIBYTE (c);
 328   return make_number (c);
 329 }
 330
 331 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 332        Smultibyte_char_to_unibyte, 1, 1, 0,
 333        doc: /* Convert the multibyte character CH to a byte.
 334 If the multibyte character does not represent a byte, return -1.  */)
 335      (ch)
 336      Lisp_Object ch;
 337 {
 338   int cm;
 339
 340   CHECK_CHARACTER (ch);
 341   cm = XFASTINT (ch);
 342   if (cm < 256)
 343     /* Can't distinguish a byte read from a unibyte buffer from
 344        a latin1 char, so let's let it slide.  */
 345     return ch;
 346   else
 347     {
 348       int cu = CHAR_TO_BYTE_SAFE (cm);
 349       return make_number (cu);
 350     }
 351 }
 352
 353 DEFUN ("char-bytes", Fchar_bytes, Schar_bytes, 1, 1, 0,
 354        doc: /* Return 1 regardless of the argument CHAR.
 355 This is now an obsolete function.  We keep it just for backward compatibility.
 356 usage: (char-bytes CHAR)  */)
 357      (ch)
 358      Lisp_Object ch;
 359 {
 360   CHECK_CHARACTER (ch);
 361   return make_number (1);
 362 }
 363
 364 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 365        doc: /* Return width of CHAR when displayed in the current buffer.
 366 The width is measured by how many columns it occupies on the screen.
 367 Tab is taken to occupy `tab-width' columns.
 368 usage: (char-width CHAR)  */)
 369      (ch)
 370        Lisp_Object ch;
 371 {
 372   Lisp_Object disp;
 373   int c, width;
 374   struct Lisp_Char_Table *dp = buffer_display_table ();
 375
 376   CHECK_CHARACTER (ch);
 377   c = XINT (ch);
 378
 379   /* Get the way the display table would display it.  */
 380   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
 381
 382   if (VECTORP (disp))
 383     width = ASIZE (disp);
 384   else
 385     width = CHAR_WIDTH (c);
 386
 387   return make_number (width);
 388 }
 389
 390 /* Return width of string STR of length LEN when displayed in the
 391    current buffer.  The width is measured by how many columns it
 392    occupies on the screen.  If PRECISION > 0, return the width of
 393    longest substring that doesn't exceed PRECISION, and set number of
 394    characters and bytes of the substring in *NCHARS and *NBYTES
 395    respectively.  */
 396
 397 int
 398 c_string_width (const unsigned char *str, int len, int precision, int *nchars, int *nbytes)
 399 {
 400   int i = 0, i_byte = 0;
 401   int width = 0;
 402   struct Lisp_Char_Table *dp = buffer_display_table ();
 403
 404   while (i_byte < len)
 405     {
 406       int bytes, thiswidth;
 407       Lisp_Object val;
 408       int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 409
 410       if (dp)
 411         {
 412           val = DISP_CHAR_VECTOR (dp, c);
 413           if (VECTORP (val))
 414             thiswidth = XVECTOR (val)->size;
 415           else
 416             thiswidth = CHAR_WIDTH (c);
 417         }
 418       else
 419         {
 420           thiswidth = CHAR_WIDTH (c);
 421         }
 422
 423       if (precision > 0
 424           && (width + thiswidth > precision))
 425         {
 426           *nchars = i;
 427           *nbytes = i_byte;
 428           return width;
 429         }
 430       i++;
 431       i_byte += bytes;
 432       width += thiswidth;
 433   }
 434
 435   if (precision > 0)
 436     {
 437       *nchars = i;
 438       *nbytes = i_byte;
 439     }
 440
 441   return width;
 442 }
 443
 444 /* Return width of string STR of length LEN when displayed in the
 445    current buffer.  The width is measured by how many columns it
 446    occupies on the screen.  */
 447
 448 int
 449 strwidth (str, len)
 450      unsigned char *str;
 451      int len;
 452 {
 453   return c_string_width (str, len, -1, NULL, NULL);
 454 }
 455
 456 /* Return width of Lisp string STRING when displayed in the current
 457    buffer.  The width is measured by how many columns it occupies on
 458    the screen while paying attention to compositions.  If PRECISION >
 459    0, return the width of longest substring that doesn't exceed
 460    PRECISION, and set number of characters and bytes of the substring
 461    in *NCHARS and *NBYTES respectively.  */
 462
 463 int
 464 lisp_string_width (string, precision, nchars, nbytes)
 465      Lisp_Object string;
 466      int precision, *nchars, *nbytes;
 467 {
 468   int len = SCHARS (string);
 469   /* This set multibyte to 0 even if STRING is multibyte when it
 470      contains only ascii and eight-bit-graphic, but that's
 471      intentional.  */
 472   int multibyte = len < SBYTES (string);
 473   unsigned char *str = SDATA (string);
 474   int i = 0, i_byte = 0;
 475   int width = 0;
 476   struct Lisp_Char_Table *dp = buffer_display_table ();
 477
 478   while (i < len)
 479     {
 480       int chars, bytes, thiswidth;
 481       Lisp_Object val;
 482       int cmp_id;
 483       EMACS_INT ignore, end;
 484
 485       if (find_composition (i, -1, &ignore, &end, &val, string)
 486           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 487               >= 0))
 488         {
 489           thiswidth = composition_table[cmp_id]->width;
 490           chars = end - i;
 491           bytes = string_char_to_byte (string, end) - i_byte;
 492         }
 493       else
 494         {
 495           int c;
 496
 497           if (multibyte)
 498             c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 499           else
 500             c = str[i_byte], bytes = 1;
 501           chars = 1;
 502           if (dp)
 503             {
 504               val = DISP_CHAR_VECTOR (dp, c);
 505               if (VECTORP (val))
 506                 thiswidth = XVECTOR (val)->size;
 507               else
 508                 thiswidth = CHAR_WIDTH (c);
 509             }
 510           else
 511             {
 512               thiswidth = CHAR_WIDTH (c);
 513             }
 514         }
 515
 516       if (precision > 0
 517           && (width + thiswidth > precision))
 518         {
 519           *nchars = i;
 520           *nbytes = i_byte;
 521           return width;
 522         }
 523       i += chars;
 524       i_byte += bytes;
 525       width += thiswidth;
 526   }
 527
 528   if (precision > 0)
 529     {
 530       *nchars = i;
 531       *nbytes = i_byte;
 532     }
 533
 534   return width;
 535 }
 536
 537 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 538        doc: /* Return width of STRING when displayed in the current buffer.
 539 Width is measured by how many columns it occupies on the screen.
 540 When calculating width of a multibyte character in STRING,
 541 only the base leading-code is considered; the validity of
 542 the following bytes is not checked.  Tabs in STRING are always
 543 taken to occupy `tab-width' columns.
 544 usage: (string-width STRING)  */)
 545      (str)
 546      Lisp_Object str;
 547 {
 548   Lisp_Object val;
 549
 550   CHECK_STRING (str);
 551   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 552   return val;
 553 }
 554
 555 DEFUN ("char-direction", Fchar_direction, Schar_direction, 1, 1, 0,
 556        doc: /* Return the direction of CHAR.
 557 The returned value is 0 for left-to-right and 1 for right-to-left.
 558 usage: (char-direction CHAR)  */)
 559      (ch)
 560      Lisp_Object ch;
 561 {
 562   int c;
 563
 564   CHECK_CHARACTER (ch);
 565   c = XINT (ch);
 566   return CHAR_TABLE_REF (Vchar_direction_table, c);
 567 }
 568
 569 /* Return the number of characters in the NBYTES bytes at PTR.
 570    This works by looking at the contents and checking for multibyte
 571    sequences while assuming that there's no invalid sequence.
 572    However, if the current buffer has enable-multibyte-characters =
 573    nil, we treat each byte as a character.  */
 574
 575 EMACS_INT
 576 chars_in_text (ptr, nbytes)
 577      const unsigned char *ptr;
 578      EMACS_INT nbytes;
 579 {
 580   /* current_buffer is null at early stages of Emacs initialization.  */
 581   if (current_buffer == 0
 582       || NILP (current_buffer->enable_multibyte_characters))
 583     return nbytes;
 584
 585   return multibyte_chars_in_text (ptr, nbytes);
 586 }
 587
 588 /* Return the number of characters in the NBYTES bytes at PTR.
 589    This works by looking at the contents and checking for multibyte
 590    sequences while assuming that there's no invalid sequence.  It
 591    ignores enable-multibyte-characters.  */
 592
 593 EMACS_INT
 594 multibyte_chars_in_text (ptr, nbytes)
 595      const unsigned char *ptr;
 596      EMACS_INT nbytes;
 597 {
 598   const unsigned char *endp = ptr + nbytes;
 599   int chars = 0;
 600
 601   while (ptr < endp)
 602     {
 603       int len = MULTIBYTE_LENGTH (ptr, endp);
 604
 605       if (len == 0)
 606         abort ();
 607       ptr += len;
 608       chars++;
 609     }
 610
 611   return chars;
 612 }
 613
 614 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 615    characters and bytes in it, and store them in *NCHARS and *NBYTES
 616    respectively.  On counting bytes, pay attention to that 8-bit
 617    characters not constructing a valid multibyte sequence are
 618    represented by 2-byte in a multibyte text.  */
 619
 620 void
 621 parse_str_as_multibyte (str, len, nchars, nbytes)
 622      const unsigned char *str;
 623      int len, *nchars, *nbytes;
 624 {
 625   const unsigned char *endp = str + len;
 626   int n, chars = 0, bytes = 0;
 627
 628   if (len >= MAX_MULTIBYTE_LENGTH)
 629     {
 630       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 631       while (str < adjusted_endp)
 632         {
 633           if ((n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 634             str += n, bytes += n;
 635           else
 636             str++, bytes += 2;
 637           chars++;
 638         }
 639     }
 640   while (str < endp)
 641     {
 642       if ((n = MULTIBYTE_LENGTH (str, endp)) > 0)
 643         str += n, bytes += n;
 644       else
 645         str++, bytes += 2;
 646       chars++;
 647     }
 648
 649   *nchars = chars;
 650   *nbytes = bytes;
 651   return;
 652 }
 653
 654 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 655    It actually converts only such 8-bit characters that don't contruct
 656    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 657    NCHARS is nonzero, set *NCHARS to the number of characters in the
 658    text.  It is assured that we can use LEN bytes at STR as a work
 659    area and that is enough.  Return the number of bytes of the
 660    resulting text.  */
 661
 662 int
 663 str_as_multibyte (str, len, nbytes, nchars)
 664      unsigned char *str;
 665      int len, nbytes, *nchars;
 666 {
 667   unsigned char *p = str, *endp = str + nbytes;
 668   unsigned char *to;
 669   int chars = 0;
 670   int n;
 671
 672   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 673     {
 674       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 675       while (p < adjusted_endp
 676              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 677         p += n, chars++;
 678     }
 679   while ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 680     p += n, chars++;
 681   if (nchars)
 682     *nchars = chars;
 683   if (p == endp)
 684     return nbytes;
 685
 686   to = p;
 687   nbytes = endp - p;
 688   endp = str + len;
 689   safe_bcopy ((char *) p, (char *) (endp - nbytes), nbytes);
 690   p = endp - nbytes;
 691
 692   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 693     {
 694       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 695       while (p < adjusted_endp)
 696         {
 697           if ((n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 698             {
 699               while (n--)
 700                 *to++ = *p++;
 701             }
 702           else
 703             {
 704               int c = *p++;
 705               c = BYTE8_TO_CHAR (c);
 706               to += CHAR_STRING (c, to);
 707             }
 708         }
 709       chars++;
 710     }
 711   while (p < endp)
 712     {
 713       if ((n = MULTIBYTE_LENGTH (p, endp)) > 0)
 714         {
 715           while (n--)
 716             *to++ = *p++;
 717         }
 718       else
 719         {
 720           int c = *p++;
 721           c = BYTE8_TO_CHAR (c);
 722           to += CHAR_STRING (c, to);
 723         }
 724       chars++;
 725     }
 726   if (nchars)
 727     *nchars = chars;
 728   return (to - str);
 729 }
 730
 731 /* Parse unibyte string at STR of LEN bytes, and return the number of
 732    bytes it may ocupy when converted to multibyte string by
 733    `str_to_multibyte'.  */
 734
 735 int
 736 parse_str_to_multibyte (str, len)
 737      unsigned char *str;
 738      int len;
 739 {
 740   unsigned char *endp = str + len;
 741   int bytes;
 742
 743   for (bytes = 0; str < endp; str++)
 744     bytes += (*str < 0x80) ? 1 : 2;
 745   return bytes;
 746 }
 747
 748
 749 /* Convert unibyte text at STR of NBYTES bytes to a multibyte text
 750    that contains the same single-byte characters.  It actually
 751    converts all 8-bit characters to multibyte forms.  It is assured
 752    that we can use LEN bytes at STR as a work area and that is
 753    enough.  */
 754
 755 int
 756 str_to_multibyte (str, len, bytes)
 757      unsigned char *str;
 758      int len, bytes;
 759 {
 760   unsigned char *p = str, *endp = str + bytes;
 761   unsigned char *to;
 762
 763   while (p < endp && *p < 0x80) p++;
 764   if (p == endp)
 765     return bytes;
 766   to = p;
 767   bytes = endp - p;
 768   endp = str + len;
 769   safe_bcopy ((char *) p, (char *) (endp - bytes), bytes);
 770   p = endp - bytes;
 771   while (p < endp)
 772     {
 773       int c = *p++;
 774
 775       if (c >= 0x80)
 776         c = BYTE8_TO_CHAR (c);
 777       to += CHAR_STRING (c, to);
 778     }
 779   return (to - str);
 780 }
 781
 782 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 783    actually converts characters in the range 0x80..0xFF to
 784    unibyte.  */
 785
 786 int
 787 str_as_unibyte (str, bytes)
 788      unsigned char *str;
 789      int bytes;
 790 {
 791   const unsigned char *p = str, *endp = str + bytes;
 792   unsigned char *to;
 793   int c, len;
 794
 795   while (p < endp)
 796     {
 797       c = *p;
 798       len = BYTES_BY_CHAR_HEAD (c);
 799       if (CHAR_BYTE8_HEAD_P (c))
 800         break;
 801       p += len;
 802     }
 803   to = str + (p - str);
 804   while (p < endp)
 805     {
 806       c = *p;
 807       len = BYTES_BY_CHAR_HEAD (c);
 808       if (CHAR_BYTE8_HEAD_P (c))
 809         {
 810           c = STRING_CHAR_ADVANCE (p);
 811           *to++ = CHAR_TO_BYTE8 (c);
 812         }
 813       else
 814         {
 815           while (len--) *to++ = *p++;
 816         }
 817     }
 818   return (to - str);
 819 }
 820
 821 /* Convert eight-bit chars in SRC (in multibyte form) to the
 822    corresponding byte and store in DST.  CHARS is the number of
 823    characters in SRC.  The value is the number of bytes stored in DST.
 824    Usually, the value is the same as CHARS, but is less than it if SRC
 825    contains a non-ASCII, non-eight-bit characater.  If ACCEPT_LATIN_1
 826    is nonzero, a Latin-1 character is accepted and converted to a byte
 827    of that character code.
 828    Note: Currently the arg ACCEPT_LATIN_1 is not used.  */
 829
 830 EMACS_INT
 831 str_to_unibyte (src, dst, chars, accept_latin_1)
 832      const unsigned char *src;
 833      unsigned char *dst;
 834      EMACS_INT chars;
 835      int accept_latin_1;
 836 {
 837   EMACS_INT i;
 838
 839   for (i = 0; i < chars; i++)
 840     {
 841       int c = STRING_CHAR_ADVANCE (src);
 842
 843       if (CHAR_BYTE8_P (c))
 844         c = CHAR_TO_BYTE8 (c);
 845       else if (! ASCII_CHAR_P (c)
 846                && (! accept_latin_1 || c >= 0x100))
 847         return i;
 848       *dst++ = c;
 849     }
 850   return i;
 851 }
 852
 853
 854 int
 855 string_count_byte8 (string)
 856      Lisp_Object string;
 857 {
 858   int multibyte = STRING_MULTIBYTE (string);
 859   int nbytes = SBYTES (string);
 860   unsigned char *p = SDATA (string);
 861   unsigned char *pend = p + nbytes;
 862   int count = 0;
 863   int c, len;
 864
 865   if (multibyte)
 866     while (p < pend)
 867       {
 868         c = *p;
 869         len = BYTES_BY_CHAR_HEAD (c);
 870
 871         if (CHAR_BYTE8_HEAD_P (c))
 872           count++;
 873         p += len;
 874       }
 875   else
 876     while (p < pend)
 877       {
 878         if (*p++ >= 0x80)
 879           count++;
 880       }
 881   return count;
 882 }
 883
 884
 885 Lisp_Object
 886 string_escape_byte8 (string)
 887      Lisp_Object string;
 888 {
 889   int nchars = SCHARS (string);
 890   int nbytes = SBYTES (string);
 891   int multibyte = STRING_MULTIBYTE (string);
 892   int byte8_count;
 893   const unsigned char *src, *src_end;
 894   unsigned char *dst;
 895   Lisp_Object val;
 896   int c, len;
 897
 898   if (multibyte && nchars == nbytes)
 899     return string;
 900
 901   byte8_count = string_count_byte8 (string);
 902
 903   if (byte8_count == 0)
 904     return string;
 905
 906   if (multibyte)
 907     /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 908     val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 909                                         nbytes + byte8_count * 2);
 910   else
 911     /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 912     val = make_uninit_string (nbytes + byte8_count * 3);
 913
 914   src = SDATA (string);
 915   src_end = src + nbytes;
 916   dst = SDATA (val);
 917   if (multibyte)
 918     while (src < src_end)
 919       {
 920         c = *src;
 921         len = BYTES_BY_CHAR_HEAD (c);
 922
 923         if (CHAR_BYTE8_HEAD_P (c))
 924           {
 925             c = STRING_CHAR_ADVANCE (src);
 926             c = CHAR_TO_BYTE8 (c);
 927             sprintf ((char *) dst, "\\%03o", c);
 928             dst += 4;
 929           }
 930         else
 931           while (len--) *dst++ = *src++;
 932       }
 933   else
 934     while (src < src_end)
 935       {
 936         c = *src++;
 937         if (c >= 0x80)
 938           {
 939             sprintf ((char *) dst, "\\%03o", c);
 940             dst += 4;
 941           }
 942         else
 943           *dst++ = c;
 944       }
 945   return val;
 946 }
 947
 948 \f
 949 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 950        doc: /*
 951 Concatenate all the argument characters and make the result a string.
 952 usage: (string &rest CHARACTERS)  */)
 953      (n, args)
 954      int n;
 955      Lisp_Object *args;
 956 {
 957   int i;
 958   unsigned char *buf = (unsigned char *) alloca (MAX_MULTIBYTE_LENGTH * n);
 959   unsigned char *p = buf;
 960   int c;
 961
 962   for (i = 0; i < n; i++)
 963     {
 964       CHECK_CHARACTER (args[i]);
 965       c = XINT (args[i]);
 966       p += CHAR_STRING (c, p);
 967     }
 968
 969   return make_string_from_bytes ((char *) buf, n, p - buf);
 970 }
 971
 972 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 973        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 974 usage: (unibyte-string &rest BYTES)  */)
 975      (n, args)
 976      int n;
 977      Lisp_Object *args;
 978 {
 979   int i;
 980   unsigned char *buf = (unsigned char *) alloca (n);
 981   unsigned char *p = buf;
 982   unsigned c;
 983
 984   for (i = 0; i < n; i++)
 985     {
 986       CHECK_NATNUM (args[i]);
 987       c = XFASTINT (args[i]);
 988       if (c >= 256)
 989         args_out_of_range_3 (args[i], make_number (0), make_number (255));
 990       *p++ = c;
 991     }
 992
 993   return make_string_from_bytes ((char *) buf, n, p - buf);
 994 }
 995
 996 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
 997        Schar_resolve_modifiers, 1, 1, 0,
 998        doc: /* Resolve modifiers in the character CHAR.
 999 The value is a character with modifiers resolved into the character
1000 code.  Unresolved modifiers are kept in the value.
1001 usage: (char-resolve-modifiers CHAR)  */)
1002      (character)
1003      Lisp_Object character;
1004 {
1005   int c;
1006
1007   CHECK_NUMBER (character);
1008   c = XINT (character);
1009   return make_number (char_resolve_modifier_mask (c));
1010 }
1011
1012 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
1013        doc: /* Return a byte value of a character at point.
1014 Optional 1st arg POSITION, if non-nil, is a position of a character to get
1015 a byte value.
1016 Optional 2nd arg STRING, if non-nil, is a string of which first
1017 character is a target to get a byte value.  In this case, POSITION, if
1018 non-nil, is an index of a target character in the string.
1019
1020 If the current buffer (or STRING) is multibyte, and the target
1021 character is not ASCII nor 8-bit character, an error is signalled.  */)
1022      (position, string)
1023      Lisp_Object position, string;
1024 {
1025   int c;
1026   EMACS_INT pos;
1027   unsigned char *p;
1028
1029   if (NILP (string))
1030     {
1031       if (NILP (position))
1032         {
1033           p = PT_ADDR;
1034         }
1035       else
1036         {
1037           CHECK_NUMBER_COERCE_MARKER (position);
1038           if (XINT (position) < BEGV || XINT (position) >= ZV)
1039             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
1040           pos = XFASTINT (position);
1041           p = CHAR_POS_ADDR (pos);
1042         }
1043       if (NILP (current_buffer->enable_multibyte_characters))
1044         return make_number (*p);
1045     }
1046   else
1047     {
1048       CHECK_STRING (string);
1049       if (NILP (position))
1050         {
1051           p = SDATA (string);
1052         }
1053       else
1054         {
1055           CHECK_NATNUM (position);
1056           if (XINT (position) >= SCHARS (string))
1057             args_out_of_range (string, position);
1058           pos = XFASTINT (position);
1059           p = SDATA (string) + string_char_to_byte (string, pos);
1060         }
1061       if (! STRING_MULTIBYTE (string))
1062         return make_number (*p);
1063     }
1064   c = STRING_CHAR (p);
1065   if (CHAR_BYTE8_P (c))
1066     c = CHAR_TO_BYTE8 (c);
1067   else if (! ASCII_CHAR_P (c))
1068     error ("Not an ASCII nor an 8-bit character: %d", c);
1069   return make_number (c);
1070 }
1071
1072
1073 void
1074 init_character_once ()
1075 {
1076 }
1077
1078 #ifdef emacs
1079
1080 void
1081 syms_of_character ()
1082 {
1083   DEFSYM (Qcharacterp, "characterp");
1084   DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1085
1086   staticpro (&Vchar_unify_table);
1087   Vchar_unify_table = Qnil;
1088
1089   defsubr (&Smax_char);
1090   defsubr (&Scharacterp);
1091   defsubr (&Sunibyte_char_to_multibyte);
1092   defsubr (&Smultibyte_char_to_unibyte);
1093   defsubr (&Schar_bytes);
1094   defsubr (&Schar_width);
1095   defsubr (&Sstring_width);
1096   defsubr (&Schar_direction);
1097   defsubr (&Sstring);
1098   defsubr (&Sunibyte_string);
1099   defsubr (&Schar_resolve_modifiers);
1100   defsubr (&Sget_byte);
1101
1102   DEFVAR_LISP ("translation-table-vector",  &Vtranslation_table_vector,
1103                doc: /*
1104 Vector recording all translation tables ever defined.
1105 Each element is a pair (SYMBOL . TABLE) relating the table to the
1106 symbol naming it.  The ID of a translation table is an index into this vector.  */);
1107   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1108
1109   DEFVAR_LISP ("auto-fill-chars", &Vauto_fill_chars,
1110                doc: /*
1111 A char-table for characters which invoke auto-filling.
1112 Such characters have value t in this table.  */);
1113   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1114   CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1115   CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
1116
1117   DEFVAR_LISP ("char-width-table", &Vchar_width_table,
1118                doc: /*
1119 A char-table for width (columns) of each character.  */);
1120   Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
1121   char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1122   char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1123                         make_number (4));
1124
1125   DEFVAR_LISP ("char-direction-table", &Vchar_direction_table,
1126                doc: /* A char-table for direction of each character.  */);
1127   Vchar_direction_table = Fmake_char_table (Qnil, make_number (1));
1128
1129   DEFVAR_LISP ("printable-chars", &Vprintable_chars,
1130                doc: /* A char-table for each printable character.  */);
1131   Vprintable_chars = Fmake_char_table (Qnil, Qnil);
1132   Fset_char_table_range (Vprintable_chars,
1133                          Fcons (make_number (32), make_number (126)), Qt);
1134   Fset_char_table_range (Vprintable_chars,
1135                          Fcons (make_number (160),
1136                                 make_number (MAX_5_BYTE_CHAR)), Qt);
1137
1138   DEFVAR_LISP ("char-script-table", &Vchar_script_table,
1139                doc: /* Char table of script symbols.
1140 It has one extra slot whose value is a list of script symbols.  */);
1141
1142   /* Intern this now in case it isn't already done.
1143      Setting this variable twice is harmless.
1144      But don't staticpro it here--that is done in alloc.c.  */
1145   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
1146   DEFSYM (Qchar_script_table, "char-script-table");
1147   Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1148   Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
1149
1150   DEFVAR_LISP ("script-representative-chars", &Vscript_representative_chars,
1151                doc: /* Alist of scripts vs the representative characters.
1152 Each element is a cons (SCRIPT . CHARS).
1153 SCRIPT is a symbol representing a script or a subgroup of a script.
1154 CHARS is a list or a vector of characters.
1155 If it is a list, all characters in the list are necessary for supporting SCRIPT.
1156 If it is a vector, one of the characters in the vector is necessary.
1157 This variable is used to find a font for a specific script.  */);
1158   Vscript_representative_chars = Qnil;
1159
1160   DEFVAR_LISP ("unicode-category-table", &Vunicode_category_table,
1161                doc: /* Char table of Unicode's "General Category".
1162 All Unicode characters have one of the following values (symbol):
1163   Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
1164   Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1165 See The Unicode Standard for the meaning of those values.  */);
1166   /* The correct char-table is setup in characters.el.  */
1167   Vunicode_category_table = Qnil;
1168 }
1169
1170 #endif /* emacs */
1171
1172 /* arch-tag: b6665960-3c3d-4184-85cd-af4318197999
1173    (do not change this comment) */