src/character.c

   1 /* Basic character support.
   2
   3 Copyright (C) 2001-2012  Free Software Foundation, Inc.
   4 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   5   Licensed to the Free Software Foundation.
   6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   7   National Institute of Advanced Industrial Science and Technology (AIST)
   8   Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
  25 /* At first, see the document in `character.h' to understand the code
  26    in this file.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #include <stdio.h>
  33
  34 #ifdef emacs
  35
  36 #include <sys/types.h>
  37 #include <setjmp.h>
  38 #include <intprops.h>
  39 #include "lisp.h"
  40 #include "character.h"
  41 #include "buffer.h"
  42 #include "charset.h"
  43 #include "composite.h"
  44 #include "disptab.h"
  45
  46 #else  /* not emacs */
  47
  48 #include "mulelib.h"
  49
  50 #endif /* emacs */
  51
/* Symbol `characterp'; initialized by DEFSYM in syms_of_character.  */
Lisp_Object Qcharacterp;

/* Symbol `auto-fill-chars'; initialized by DEFSYM in
   syms_of_character and used there as the purpose of the
   `auto-fill-chars' char-table.  */
static Lisp_Object Qauto_fill_chars;

/* Char-table of information about which character to unify to which
   Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
Lisp_Object Vchar_unify_table;

/* Variable used locally in the macro FETCH_MULTIBYTE_CHAR.  */
unsigned char *_fetch_multibyte_char_p;

/* Symbol `char-script-table'; initialized by DEFSYM in
   syms_of_character and used there as the purpose of the
   `char-script-table' char-table.  */
static Lisp_Object Qchar_script_table;
  64
  65 \f
  66
  67 /* If character code C has modifier masks, reflect them to the
  68    character code if possible.  Return the resulting code.  */
  69
  70 EMACS_INT
  71 char_resolve_modifier_mask (EMACS_INT c)
  72 {
  73   /* A non-ASCII character can't reflect modifier bits to the code.  */
  74   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
  75     return c;
  76
  77   /* For Meta, Shift, and Control modifiers, we need special care.  */
  78   if (c & CHAR_SHIFT)
  79     {
  80       /* Shift modifier is valid only with [A-Za-z].  */
  81       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
  82         c &= ~CHAR_SHIFT;
  83       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
  84         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
  85       /* Shift modifier for control characters and SPC is ignored.  */
  86       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
  87         c &= ~CHAR_SHIFT;
  88     }
  89   if (c & CHAR_CTL)
  90     {
  91       /* Simulate the code in lread.c.  */
  92       /* Allow `\C- ' and `\C-?'.  */
  93       if ((c & 0377) == ' ')
  94         c &= ~0177 & ~ CHAR_CTL;
  95       else if ((c & 0377) == '?')
  96         c = 0177 | (c & ~0177 & ~CHAR_CTL);
  97       /* ASCII control chars are made from letters (both cases),
  98          as well as the non-letters within 0100...0137.  */
  99       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
 100         c &= (037 | (~0177 & ~CHAR_CTL));
 101       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 102         c &= (037 | (~0177 & ~CHAR_CTL));
 103     }
 104 #if 0   /* This is outside the scope of this function.  (bug#4751)  */
 105   if (c & CHAR_META)
 106     {
 107       /* Move the meta bit to the right place for a string.  */
 108       c = (c & ~CHAR_META) | 0x80;
 109     }
 110 #endif
 111
 112   return c;
 113 }
 114
 115
 116 /* Store multibyte form of character C at P.  If C has modifier bits,
 117    handle them appropriately.  */
 118
 119 int
 120 char_string (unsigned int c, unsigned char *p)
 121 {
 122   int bytes;
 123
 124   if (c & CHAR_MODIFIER_MASK)
 125     {
 126       c = char_resolve_modifier_mask (c);
 127       /* If C still has any modifier bits, just ignore it.  */
 128       c &= ~CHAR_MODIFIER_MASK;
 129     }
 130
 131   MAYBE_UNIFY_CHAR (c);
 132
 133   if (c <= MAX_3_BYTE_CHAR)
 134     {
 135       bytes = CHAR_STRING (c, p);
 136     }
 137   else if (c <= MAX_4_BYTE_CHAR)
 138     {
 139       p[0] = (0xF0 | (c >> 18));
 140       p[1] = (0x80 | ((c >> 12) & 0x3F));
 141       p[2] = (0x80 | ((c >> 6) & 0x3F));
 142       p[3] = (0x80 | (c & 0x3F));
 143       bytes = 4;
 144     }
 145   else if (c <= MAX_5_BYTE_CHAR)
 146     {
 147       p[0] = 0xF8;
 148       p[1] = (0x80 | ((c >> 18) & 0x0F));
 149       p[2] = (0x80 | ((c >> 12) & 0x3F));
 150       p[3] = (0x80 | ((c >> 6) & 0x3F));
 151       p[4] = (0x80 | (c & 0x3F));
 152       bytes = 5;
 153     }
 154   else if (c <= MAX_CHAR)
 155     {
 156       c = CHAR_TO_BYTE8 (c);
 157       bytes = BYTE8_STRING (c, p);
 158     }
 159   else
 160     error ("Invalid character: %x", c);
 161
 162   return bytes;
 163 }
 164
 165
 166 /* Return a character whose multibyte form is at P.  If LEN is not
 167    NULL, it must be a pointer to integer.  In that case, set *LEN to
 168    the byte length of the multibyte form.  If ADVANCED is not NULL, it
 169    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 170    the ending address (i.e., the starting address of the next
 171    character) of the multibyte form.  */
 172
 173 int
 174 string_char (const unsigned char *p, const unsigned char **advanced, int *len)
 175 {
 176   int c;
 177   const unsigned char *saved_p = p;
 178
 179   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 180     {
 181       c = STRING_CHAR_ADVANCE (p);
 182     }
 183   else if (! (*p & 0x08))
 184     {
 185       c = ((((p)[0] & 0xF) << 18)
 186            | (((p)[1] & 0x3F) << 12)
 187            | (((p)[2] & 0x3F) << 6)
 188            | ((p)[3] & 0x3F));
 189       p += 4;
 190     }
 191   else
 192     {
 193       c = ((((p)[1] & 0x3F) << 18)
 194            | (((p)[2] & 0x3F) << 12)
 195            | (((p)[3] & 0x3F) << 6)
 196            | ((p)[4] & 0x3F));
 197       p += 5;
 198     }
 199
 200   MAYBE_UNIFY_CHAR (c);
 201
 202   if (len)
 203     *len = p - saved_p;
 204   if (advanced)
 205     *advanced = p;
 206   return c;
 207 }
 208
 209
 210 /* Translate character C by translation table TABLE.  If no translation is
 211    found in TABLE, return the untranslated character.  If TABLE is a list,
 212    elements are char tables.  In that case, recursively translate C by all the
 213    tables in the list.  */
 214
 215 int
 216 translate_char (Lisp_Object table, int c)
 217 {
 218   if (CHAR_TABLE_P (table))
 219     {
 220       Lisp_Object ch;
 221
 222       ch = CHAR_TABLE_REF (table, c);
 223       if (CHARACTERP (ch))
 224         c = XINT (ch);
 225     }
 226   else
 227     {
 228       for (; CONSP (table); table = XCDR (table))
 229         c = translate_char (XCAR (table), c);
 230     }
 231   return c;
 232 }
 233
 234 /* Convert ASCII or 8-bit character C to unibyte.  If C is none of
 235    them, return (C & 0xFF).  */
 236
 237 int
 238 multibyte_char_to_unibyte (int c)
 239 {
 240   if (c < 0x80)
 241     return c;
 242   if (CHAR_BYTE8_P (c))
 243     return CHAR_TO_BYTE8 (c);
 244   return (c & 0xFF);
 245 }
 246
 247 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 248    by charset_unibyte.  */
 249
 250 int
 251 multibyte_char_to_unibyte_safe (int c)
 252 {
 253   if (c < 0x80)
 254     return c;
 255   if (CHAR_BYTE8_P (c))
 256     return CHAR_TO_BYTE8 (c);
 257   return -1;
 258 }
 259
 260 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 261        doc: /* Return non-nil if OBJECT is a character.
 262 usage: (characterp OBJECT)  */)
 263   (Lisp_Object object, Lisp_Object ignore)
 264 {
 265   return (CHARACTERP (object) ? Qt : Qnil);
 266 }
 267
 268 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 269        doc: /* Return the character of the maximum code.  */)
 270   (void)
 271 {
 272   return make_number (MAX_CHAR);
 273 }
 274
 275 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 276        Sunibyte_char_to_multibyte, 1, 1, 0,
 277        doc: /* Convert the byte CH to multibyte character.  */)
 278   (Lisp_Object ch)
 279 {
 280   int c;
 281
 282   CHECK_CHARACTER (ch);
 283   c = XFASTINT (ch);
 284   if (c >= 0x100)
 285     error ("Not a unibyte character: %d", c);
 286   MAKE_CHAR_MULTIBYTE (c);
 287   return make_number (c);
 288 }
 289
 290 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 291        Smultibyte_char_to_unibyte, 1, 1, 0,
 292        doc: /* Convert the multibyte character CH to a byte.
 293 If the multibyte character does not represent a byte, return -1.  */)
 294   (Lisp_Object ch)
 295 {
 296   int cm;
 297
 298   CHECK_CHARACTER (ch);
 299   cm = XFASTINT (ch);
 300   if (cm < 256)
 301     /* Can't distinguish a byte read from a unibyte buffer from
 302        a latin1 char, so let's let it slide.  */
 303     return ch;
 304   else
 305     {
 306       int cu = CHAR_TO_BYTE_SAFE (cm);
 307       return make_number (cu);
 308     }
 309 }
 310
 311 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 312        doc: /* Return width of CHAR when displayed in the current buffer.
 313 The width is measured by how many columns it occupies on the screen.
 314 Tab is taken to occupy `tab-width' columns.
 315 usage: (char-width CHAR)  */)
 316   (Lisp_Object ch)
 317 {
 318   Lisp_Object disp;
 319   int c, width;
 320   struct Lisp_Char_Table *dp = buffer_display_table ();
 321
 322   CHECK_CHARACTER (ch);
 323   c = XINT (ch);
 324
 325   /* Get the way the display table would display it.  */
 326   disp = dp ? DISP_CHAR_VECTOR (dp, c) : Qnil;
 327
 328   if (VECTORP (disp))
 329     width = sanitize_char_width (ASIZE (disp));
 330   else
 331     width = CHAR_WIDTH (c);
 332
 333   return make_number (width);
 334 }
 335
 336 /* Return width of string STR of length LEN when displayed in the
 337    current buffer.  The width is measured by how many columns it
 338    occupies on the screen.  If PRECISION > 0, return the width of
 339    longest substring that doesn't exceed PRECISION, and set number of
 340    characters and bytes of the substring in *NCHARS and *NBYTES
 341    respectively.  */
 342
 343 ptrdiff_t
 344 c_string_width (const unsigned char *str, ptrdiff_t len, int precision,
 345                 ptrdiff_t *nchars, ptrdiff_t *nbytes)
 346 {
 347   ptrdiff_t i = 0, i_byte = 0;
 348   ptrdiff_t width = 0;
 349   struct Lisp_Char_Table *dp = buffer_display_table ();
 350
 351   while (i_byte < len)
 352     {
 353       int bytes, thiswidth;
 354       Lisp_Object val;
 355       int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 356
 357       if (dp)
 358         {
 359           val = DISP_CHAR_VECTOR (dp, c);
 360           if (VECTORP (val))
 361             thiswidth = sanitize_char_width (ASIZE (val));
 362           else
 363             thiswidth = CHAR_WIDTH (c);
 364         }
 365       else
 366         {
 367           thiswidth = CHAR_WIDTH (c);
 368         }
 369
 370       if (precision > 0
 371           && (width + thiswidth > precision))
 372         {
 373           *nchars = i;
 374           *nbytes = i_byte;
 375           return width;
 376         }
 377       i++;
 378       i_byte += bytes;
 379       width += thiswidth;
 380   }
 381
 382   if (precision > 0)
 383     {
 384       *nchars = i;
 385       *nbytes = i_byte;
 386     }
 387
 388   return width;
 389 }
 390
 391 /* Return width of string STR of length LEN when displayed in the
 392    current buffer.  The width is measured by how many columns it
 393    occupies on the screen.  */
 394
 395 ptrdiff_t
 396 strwidth (const char *str, ptrdiff_t len)
 397 {
 398   return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
 399 }
 400
 401 /* Return width of Lisp string STRING when displayed in the current
 402    buffer.  The width is measured by how many columns it occupies on
 403    the screen while paying attention to compositions.  If PRECISION >
 404    0, return the width of longest substring that doesn't exceed
 405    PRECISION, and set number of characters and bytes of the substring
 406    in *NCHARS and *NBYTES respectively.  */
 407
 408 ptrdiff_t
 409 lisp_string_width (Lisp_Object string, ptrdiff_t precision,
 410                    ptrdiff_t *nchars, ptrdiff_t *nbytes)
 411 {
 412   ptrdiff_t len = SCHARS (string);
 413   /* This set multibyte to 0 even if STRING is multibyte when it
 414      contains only ascii and eight-bit-graphic, but that's
 415      intentional.  */
 416   int multibyte = len < SBYTES (string);
 417   unsigned char *str = SDATA (string);
 418   ptrdiff_t i = 0, i_byte = 0;
 419   ptrdiff_t width = 0;
 420   struct Lisp_Char_Table *dp = buffer_display_table ();
 421
 422   while (i < len)
 423     {
 424       ptrdiff_t chars, bytes, thiswidth;
 425       Lisp_Object val;
 426       ptrdiff_t cmp_id;
 427       ptrdiff_t ignore, end;
 428
 429       if (find_composition (i, -1, &ignore, &end, &val, string)
 430           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 431               >= 0))
 432         {
 433           thiswidth = composition_table[cmp_id]->width;
 434           chars = end - i;
 435           bytes = string_char_to_byte (string, end) - i_byte;
 436         }
 437       else
 438         {
 439           int c;
 440
 441           if (multibyte)
 442             {
 443               int cbytes;
 444               c = STRING_CHAR_AND_LENGTH (str + i_byte, cbytes);
 445               bytes = cbytes;
 446             }
 447           else
 448             c = str[i_byte], bytes = 1;
 449           chars = 1;
 450           if (dp)
 451             {
 452               val = DISP_CHAR_VECTOR (dp, c);
 453               if (VECTORP (val))
 454                 thiswidth = sanitize_char_width (ASIZE (val));
 455               else
 456                 thiswidth = CHAR_WIDTH (c);
 457             }
 458           else
 459             {
 460               thiswidth = CHAR_WIDTH (c);
 461             }
 462         }
 463
 464       if (precision <= 0)
 465         {
 466 #ifdef emacs
 467           if (INT_ADD_OVERFLOW (width, thiswidth))
 468             string_overflow ();
 469 #endif
 470         }
 471       else if (precision - width < thiswidth)
 472         {
 473           *nchars = i;
 474           *nbytes = i_byte;
 475           return width;
 476         }
 477       i += chars;
 478       i_byte += bytes;
 479       width += thiswidth;
 480     }
 481
 482   if (precision > 0)
 483     {
 484       *nchars = i;
 485       *nbytes = i_byte;
 486     }
 487
 488   return width;
 489 }
 490
 491 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 492        doc: /* Return width of STRING when displayed in the current buffer.
 493 Width is measured by how many columns it occupies on the screen.
 494 When calculating width of a multibyte character in STRING,
 495 only the base leading-code is considered; the validity of
 496 the following bytes is not checked.  Tabs in STRING are always
 497 taken to occupy `tab-width' columns.
 498 usage: (string-width STRING)  */)
 499   (Lisp_Object str)
 500 {
 501   Lisp_Object val;
 502
 503   CHECK_STRING (str);
 504   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 505   return val;
 506 }
 507
 508 /* Return the number of characters in the NBYTES bytes at PTR.
 509    This works by looking at the contents and checking for multibyte
 510    sequences while assuming that there's no invalid sequence.
 511    However, if the current buffer has enable-multibyte-characters =
 512    nil, we treat each byte as a character.  */
 513
 514 ptrdiff_t
 515 chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 516 {
 517   /* current_buffer is null at early stages of Emacs initialization.  */
 518   if (current_buffer == 0
 519       || NILP (BVAR (current_buffer, enable_multibyte_characters)))
 520     return nbytes;
 521
 522   return multibyte_chars_in_text (ptr, nbytes);
 523 }
 524
 525 /* Return the number of characters in the NBYTES bytes at PTR.
 526    This works by looking at the contents and checking for multibyte
 527    sequences while assuming that there's no invalid sequence.  It
 528    ignores enable-multibyte-characters.  */
 529
 530 ptrdiff_t
 531 multibyte_chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 532 {
 533   const unsigned char *endp = ptr + nbytes;
 534   ptrdiff_t chars = 0;
 535
 536   while (ptr < endp)
 537     {
 538       int len = MULTIBYTE_LENGTH (ptr, endp);
 539
 540       if (len == 0)
 541         abort ();
 542       ptr += len;
 543       chars++;
 544     }
 545
 546   return chars;
 547 }
 548
 549 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 550    characters and bytes in it, and store them in *NCHARS and *NBYTES
 551    respectively.  On counting bytes, pay attention to that 8-bit
 552    characters not constructing a valid multibyte sequence are
 553    represented by 2-byte in a multibyte text.  */
 554
 555 void
 556 parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
 557                         ptrdiff_t *nchars, ptrdiff_t *nbytes)
 558 {
 559   const unsigned char *endp = str + len;
 560   int n;
 561   ptrdiff_t chars = 0, bytes = 0;
 562
 563   if (len >= MAX_MULTIBYTE_LENGTH)
 564     {
 565       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 566       while (str < adjusted_endp)
 567         {
 568           if (! CHAR_BYTE8_HEAD_P (*str)
 569               && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 570             str += n, bytes += n;
 571           else
 572             str++, bytes += 2;
 573           chars++;
 574         }
 575     }
 576   while (str < endp)
 577     {
 578       if (! CHAR_BYTE8_HEAD_P (*str)
 579           && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
 580         str += n, bytes += n;
 581       else
 582         str++, bytes += 2;
 583       chars++;
 584     }
 585
 586   *nchars = chars;
 587   *nbytes = bytes;
 588   return;
 589 }
 590
 591 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 592    It actually converts only such 8-bit characters that don't construct
 593    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 594    NCHARS is nonzero, set *NCHARS to the number of characters in the
 595    text.  It is assured that we can use LEN bytes at STR as a work
 596    area and that is enough.  Return the number of bytes of the
 597    resulting text.  */
 598
 599 ptrdiff_t
 600 str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
 601                   ptrdiff_t *nchars)
 602 {
 603   unsigned char *p = str, *endp = str + nbytes;
 604   unsigned char *to;
 605   ptrdiff_t chars = 0;
 606   int n;
 607
 608   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 609     {
 610       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 611       while (p < adjusted_endp
 612              && ! CHAR_BYTE8_HEAD_P (*p)
 613              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 614         p += n, chars++;
 615     }
 616   while (p < endp
 617          && ! CHAR_BYTE8_HEAD_P (*p)
 618          && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 619     p += n, chars++;
 620   if (nchars)
 621     *nchars = chars;
 622   if (p == endp)
 623     return nbytes;
 624
 625   to = p;
 626   nbytes = endp - p;
 627   endp = str + len;
 628   memmove (endp - nbytes, p, nbytes);
 629   p = endp - nbytes;
 630
 631   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 632     {
 633       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 634       while (p < adjusted_endp)
 635         {
 636           if (! CHAR_BYTE8_HEAD_P (*p)
 637               && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 638             {
 639               while (n--)
 640                 *to++ = *p++;
 641             }
 642           else
 643             {
 644               int c = *p++;
 645               c = BYTE8_TO_CHAR (c);
 646               to += CHAR_STRING (c, to);
 647             }
 648         }
 649       chars++;
 650     }
 651   while (p < endp)
 652     {
 653       if (! CHAR_BYTE8_HEAD_P (*p)
 654           && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 655         {
 656           while (n--)
 657             *to++ = *p++;
 658         }
 659       else
 660         {
 661           int c = *p++;
 662           c = BYTE8_TO_CHAR (c);
 663           to += CHAR_STRING (c, to);
 664         }
 665       chars++;
 666     }
 667   if (nchars)
 668     *nchars = chars;
 669   return (to - str);
 670 }
 671
 672 /* Parse unibyte string at STR of LEN bytes, and return the number of
 673    bytes it may occupy when converted to multibyte string by
 674    `str_to_multibyte'.  */
 675
 676 ptrdiff_t
 677 count_size_as_multibyte (const unsigned char *str, ptrdiff_t len)
 678 {
 679   const unsigned char *endp = str + len;
 680   ptrdiff_t bytes;
 681
 682   for (bytes = 0; str < endp; str++)
 683     {
 684       int n = *str < 0x80 ? 1 : 2;
 685       if (INT_ADD_OVERFLOW (bytes, n))
 686         string_overflow ();
 687       bytes += n;
 688     }
 689   return bytes;
 690 }
 691
 692
 693 /* Convert unibyte text at STR of BYTES bytes to a multibyte text
 694    that contains the same single-byte characters.  It actually
 695    converts all 8-bit characters to multibyte forms.  It is assured
 696    that we can use LEN bytes at STR as a work area and that is
 697    enough.  */
 698
 699 ptrdiff_t
 700 str_to_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t bytes)
 701 {
 702   unsigned char *p = str, *endp = str + bytes;
 703   unsigned char *to;
 704
 705   while (p < endp && *p < 0x80) p++;
 706   if (p == endp)
 707     return bytes;
 708   to = p;
 709   bytes = endp - p;
 710   endp = str + len;
 711   memmove (endp - bytes, p, bytes);
 712   p = endp - bytes;
 713   while (p < endp)
 714     {
 715       int c = *p++;
 716
 717       if (c >= 0x80)
 718         c = BYTE8_TO_CHAR (c);
 719       to += CHAR_STRING (c, to);
 720     }
 721   return (to - str);
 722 }
 723
 724 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 725    actually converts characters in the range 0x80..0xFF to
 726    unibyte.  */
 727
 728 ptrdiff_t
 729 str_as_unibyte (unsigned char *str, ptrdiff_t bytes)
 730 {
 731   const unsigned char *p = str, *endp = str + bytes;
 732   unsigned char *to;
 733   int c, len;
 734
 735   while (p < endp)
 736     {
 737       c = *p;
 738       len = BYTES_BY_CHAR_HEAD (c);
 739       if (CHAR_BYTE8_HEAD_P (c))
 740         break;
 741       p += len;
 742     }
 743   to = str + (p - str);
 744   while (p < endp)
 745     {
 746       c = *p;
 747       len = BYTES_BY_CHAR_HEAD (c);
 748       if (CHAR_BYTE8_HEAD_P (c))
 749         {
 750           c = STRING_CHAR_ADVANCE (p);
 751           *to++ = CHAR_TO_BYTE8 (c);
 752         }
 753       else
 754         {
 755           while (len--) *to++ = *p++;
 756         }
 757     }
 758   return (to - str);
 759 }
 760
 761 /* Convert eight-bit chars in SRC (in multibyte form) to the
 762    corresponding byte and store in DST.  CHARS is the number of
 763    characters in SRC.  The value is the number of bytes stored in DST.
 764    Usually, the value is the same as CHARS, but is less than it if SRC
 765    contains a non-ASCII, non-eight-bit character.  If ACCEPT_LATIN_1
 766    is nonzero, a Latin-1 character is accepted and converted to a byte
 767    of that character code.
 768    Note: Currently the arg ACCEPT_LATIN_1 is not used.  */
 769
 770 ptrdiff_t
 771 str_to_unibyte (const unsigned char *src, unsigned char *dst, ptrdiff_t chars, int accept_latin_1)
 772 {
 773   ptrdiff_t i;
 774
 775   for (i = 0; i < chars; i++)
 776     {
 777       int c = STRING_CHAR_ADVANCE (src);
 778
 779       if (CHAR_BYTE8_P (c))
 780         c = CHAR_TO_BYTE8 (c);
 781       else if (! ASCII_CHAR_P (c)
 782                && (! accept_latin_1 || c >= 0x100))
 783         return i;
 784       *dst++ = c;
 785     }
 786   return i;
 787 }
 788
 789
 790 static ptrdiff_t
 791 string_count_byte8 (Lisp_Object string)
 792 {
 793   int multibyte = STRING_MULTIBYTE (string);
 794   ptrdiff_t nbytes = SBYTES (string);
 795   unsigned char *p = SDATA (string);
 796   unsigned char *pend = p + nbytes;
 797   ptrdiff_t count = 0;
 798   int c, len;
 799
 800   if (multibyte)
 801     while (p < pend)
 802       {
 803         c = *p;
 804         len = BYTES_BY_CHAR_HEAD (c);
 805
 806         if (CHAR_BYTE8_HEAD_P (c))
 807           count++;
 808         p += len;
 809       }
 810   else
 811     while (p < pend)
 812       {
 813         if (*p++ >= 0x80)
 814           count++;
 815       }
 816   return count;
 817 }
 818
 819
 820 Lisp_Object
 821 string_escape_byte8 (Lisp_Object string)
 822 {
 823   ptrdiff_t nchars = SCHARS (string);
 824   ptrdiff_t nbytes = SBYTES (string);
 825   int multibyte = STRING_MULTIBYTE (string);
 826   ptrdiff_t byte8_count;
 827   const unsigned char *src, *src_end;
 828   unsigned char *dst;
 829   Lisp_Object val;
 830   int c, len;
 831
 832   if (multibyte && nchars == nbytes)
 833     return string;
 834
 835   byte8_count = string_count_byte8 (string);
 836
 837   if (byte8_count == 0)
 838     return string;
 839
 840   if (multibyte)
 841     {
 842       if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
 843           || (STRING_BYTES_BOUND - nbytes) / 2 < byte8_count)
 844         string_overflow ();
 845
 846       /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 847       val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 848                                           nbytes + byte8_count * 2);
 849     }
 850   else
 851     {
 852       if ((STRING_BYTES_BOUND - nbytes) / 3 < byte8_count)
 853         string_overflow ();
 854
 855       /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 856       val = make_uninit_string (nbytes + byte8_count * 3);
 857     }
 858
 859   src = SDATA (string);
 860   src_end = src + nbytes;
 861   dst = SDATA (val);
 862   if (multibyte)
 863     while (src < src_end)
 864       {
 865         c = *src;
 866         len = BYTES_BY_CHAR_HEAD (c);
 867
 868         if (CHAR_BYTE8_HEAD_P (c))
 869           {
 870             c = STRING_CHAR_ADVANCE (src);
 871             c = CHAR_TO_BYTE8 (c);
 872             sprintf ((char *) dst, "\\%03o", c);
 873             dst += 4;
 874           }
 875         else
 876           while (len--) *dst++ = *src++;
 877       }
 878   else
 879     while (src < src_end)
 880       {
 881         c = *src++;
 882         if (c >= 0x80)
 883           {
 884             sprintf ((char *) dst, "\\%03o", c);
 885             dst += 4;
 886           }
 887         else
 888           *dst++ = c;
 889       }
 890   return val;
 891 }
 892
 893 \f
 894 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 895        doc: /*
 896 Concatenate all the argument characters and make the result a string.
 897 usage: (string &rest CHARACTERS)  */)
 898   (ptrdiff_t n, Lisp_Object *args)
 899 {
 900   ptrdiff_t i;
 901   int c;
 902   unsigned char *buf, *p;
 903   Lisp_Object str;
 904   USE_SAFE_ALLOCA;
 905
 906   SAFE_NALLOCA (buf, MAX_MULTIBYTE_LENGTH, n);
 907   p = buf;
 908
 909   for (i = 0; i < n; i++)
 910     {
 911       CHECK_CHARACTER (args[i]);
 912       c = XINT (args[i]);
 913       p += CHAR_STRING (c, p);
 914     }
 915
 916   str = make_string_from_bytes ((char *) buf, n, p - buf);
 917   SAFE_FREE ();
 918   return str;
 919 }
 920
 921 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 922        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 923 usage: (unibyte-string &rest BYTES)  */)
 924   (ptrdiff_t n, Lisp_Object *args)
 925 {
 926   ptrdiff_t i;
 927   unsigned char *buf, *p;
 928   Lisp_Object str;
 929   USE_SAFE_ALLOCA;
 930
 931   SAFE_ALLOCA (buf, unsigned char *, n);
 932   p = buf;
 933
 934   for (i = 0; i < n; i++)
 935     {
 936       CHECK_RANGED_INTEGER (0, args[i], 255);
 937       *p++ = XINT (args[i]);
 938     }
 939
 940   str = make_string_from_bytes ((char *) buf, n, p - buf);
 941   SAFE_FREE ();
 942   return str;
 943 }
 944
 945 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
 946        Schar_resolve_modifiers, 1, 1, 0,
 947        doc: /* Resolve modifiers in the character CHAR.
 948 The value is a character with modifiers resolved into the character
 949 code.  Unresolved modifiers are kept in the value.
 950 usage: (char-resolve-modifiers CHAR)  */)
 951   (Lisp_Object character)
 952 {
 953   EMACS_INT c;
 954
 955   CHECK_NUMBER (character);
 956   c = XINT (character);
 957   return make_number (char_resolve_modifier_mask (c));
 958 }
 959
 960 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
 961        doc: /* Return a byte value of a character at point.
 962 Optional 1st arg POSITION, if non-nil, is a position of a character to get
 963 a byte value.
 964 Optional 2nd arg STRING, if non-nil, is a string of which first
 965 character is a target to get a byte value.  In this case, POSITION, if
 966 non-nil, is an index of a target character in the string.
 967
 968 If the current buffer (or STRING) is multibyte, and the target
 969 character is not ASCII nor 8-bit character, an error is signaled.  */)
 970   (Lisp_Object position, Lisp_Object string)
 971 {
 972   int c;
 973   ptrdiff_t pos;
 974   unsigned char *p;
 975
 976   if (NILP (string))
 977     {
 978       if (NILP (position))
 979         {
 980           p = PT_ADDR;
 981         }
 982       else
 983         {
 984           CHECK_NUMBER_COERCE_MARKER (position);
 985           if (XINT (position) < BEGV || XINT (position) >= ZV)
 986             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
 987           pos = XFASTINT (position);
 988           p = CHAR_POS_ADDR (pos);
 989         }
 990       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
 991         return make_number (*p);
 992     }
 993   else
 994     {
 995       CHECK_STRING (string);
 996       if (NILP (position))
 997         {
 998           p = SDATA (string);
 999         }
1000       else
1001         {
1002           CHECK_NATNUM (position);
1003           if (XINT (position) >= SCHARS (string))
1004             args_out_of_range (string, position);
1005           pos = XFASTINT (position);
1006           p = SDATA (string) + string_char_to_byte (string, pos);
1007         }
1008       if (! STRING_MULTIBYTE (string))
1009         return make_number (*p);
1010     }
1011   c = STRING_CHAR (p);
1012   if (CHAR_BYTE8_P (c))
1013     c = CHAR_TO_BYTE8 (c);
1014   else if (! ASCII_CHAR_P (c))
1015     error ("Not an ASCII nor an 8-bit character: %d", c);
1016   return make_number (c);
1017 }
1018
1019
void
init_character_once (void)
{
  /* Nothing to set up.  */
}
1024
1025 #ifdef emacs
1026
/* Set up the Lisp-visible symbols, primitives and variables defined
   in this file, and give the variables their initial values.  */

void
syms_of_character (void)
{
  DEFSYM (Qcharacterp, "characterp");
  DEFSYM (Qauto_fill_chars, "auto-fill-chars");

  /* Vchar_unify_table is not declared through DEFVAR_LISP here, so
     it is registered with staticpro explicitly.  */
  staticpro (&Vchar_unify_table);
  Vchar_unify_table = Qnil;

  /* Register the primitives defined in this file.  */
  defsubr (&Smax_char);
  defsubr (&Scharacterp);
  defsubr (&Sunibyte_char_to_multibyte);
  defsubr (&Smultibyte_char_to_unibyte);
  defsubr (&Schar_width);
  defsubr (&Sstring_width);
  defsubr (&Sstring);
  defsubr (&Sunibyte_string);
  defsubr (&Schar_resolve_modifiers);
  defsubr (&Sget_byte);

  DEFVAR_LISP ("translation-table-vector",  Vtranslation_table_vector,
               doc: /*
Vector recording all translation tables ever defined.
Each element is a pair (SYMBOL . TABLE) relating the table to the
symbol naming it.  The ID of a translation table is an index into this vector.  */);
  /* Start with 16 empty slots.  */
  Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);

  DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
               doc: /*
A char-table for characters which invoke auto-filling.
Such characters have value t in this table.  */);
  Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
  /* Initially only space and newline are marked.  */
  CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
  CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);

  DEFVAR_LISP ("char-width-table", Vchar_width_table,
               doc: /*
A char-table for width (columns) of each character.  */);
  /* The default width is 1; the ranges 0x80..0x9F and
     MAX_5_BYTE_CHAR + 1..MAX_CHAR are given width 4.  */
  Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
  char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
  char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
                        make_number (4));

  DEFVAR_LISP ("printable-chars", Vprintable_chars,
               doc: /* A char-table for each printable character.  */);
  /* Mark 32..126 and 160..MAX_5_BYTE_CHAR; everything else stays
     nil.  */
  Vprintable_chars = Fmake_char_table (Qnil, Qnil);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (32), make_number (126)), Qt);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (160),
                                make_number (MAX_5_BYTE_CHAR)), Qt);

  DEFVAR_LISP ("char-script-table", Vchar_script_table,
               doc: /* Char table of script symbols.
It has one extra slot whose value is a list of script symbols.  */);

  /* Intern this now in case it isn't already done.
     Setting this variable twice is harmless.
     But don't staticpro it here--that is done in alloc.c.  */
  Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
  DEFSYM (Qchar_script_table, "char-script-table");
  /* Record the extra-slot count on the symbol before the table is
     created with that symbol as its purpose.  */
  Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
  Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);

  DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
               doc: /* Alist of scripts vs the representative characters.
Each element is a cons (SCRIPT . CHARS).
SCRIPT is a symbol representing a script or a subgroup of a script.
CHARS is a list or a vector of characters.
If it is a list, all characters in the list are necessary for supporting SCRIPT.
If it is a vector, one of the characters in the vector is necessary.
This variable is used to find a font for a specific script.  */);
  Vscript_representative_chars = Qnil;

  DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
               doc: /* Char table of Unicode's "General Category".
All Unicode characters have one of the following values (symbol):
  Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
  Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
See The Unicode Standard for the meaning of those values.  */);
  /* The correct char-table is setup in characters.el.  */
  Vunicode_category_table = Qnil;
}
1110
1111 #endif /* emacs */