src/character.c

   1 /* Basic character support.
   2
   3 Copyright (C) 2001-2012  Free Software Foundation, Inc.
   4 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   5   Licensed to the Free Software Foundation.
   6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   7   National Institute of Advanced Industrial Science and Technology (AIST)
   8   Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
/* Before reading the code in this file, see the documentation in
   `character.h'.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #define CHARACTER_INLINE EXTERN_INLINE
  33
  34 #include <stdio.h>
  35
  36 #ifdef emacs
  37
  38 #include <sys/types.h>
  39 #include <setjmp.h>
  40 #include <intprops.h>
  41 #include "lisp.h"
  42 #include "character.h"
  43 #include "buffer.h"
  44 #include "charset.h"
  45 #include "composite.h"
  46 #include "disptab.h"
  47
  48 #else  /* not emacs */
  49
  50 #include "mulelib.h"
  51
  52 #endif /* emacs */
  53
/* Symbol `characterp'; set up by DEFSYM in syms_of_character.  */
Lisp_Object Qcharacterp;

/* Symbol `auto-fill-chars'; set up by DEFSYM in syms_of_character.  */
static Lisp_Object Qauto_fill_chars;

/* Char-table of information about which character to unify to which
   Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.
   Initialized to nil and staticpro'd in syms_of_character.  */
Lisp_Object Vchar_unify_table;

/* Symbol `char-script-table'; set up by DEFSYM in syms_of_character.  */
static Lisp_Object Qchar_script_table;
  63
  64 \f
  65
  66 /* If character code C has modifier masks, reflect them to the
  67    character code if possible.  Return the resulting code.  */
  68
  69 EMACS_INT
  70 char_resolve_modifier_mask (EMACS_INT c)
  71 {
  72   /* A non-ASCII character can't reflect modifier bits to the code.  */
  73   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
  74     return c;
  75
  76   /* For Meta, Shift, and Control modifiers, we need special care.  */
  77   if (c & CHAR_SHIFT)
  78     {
  79       /* Shift modifier is valid only with [A-Za-z].  */
  80       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
  81         c &= ~CHAR_SHIFT;
  82       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
  83         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
  84       /* Shift modifier for control characters and SPC is ignored.  */
  85       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
  86         c &= ~CHAR_SHIFT;
  87     }
  88   if (c & CHAR_CTL)
  89     {
  90       /* Simulate the code in lread.c.  */
  91       /* Allow `\C- ' and `\C-?'.  */
  92       if ((c & 0377) == ' ')
  93         c &= ~0177 & ~ CHAR_CTL;
  94       else if ((c & 0377) == '?')
  95         c = 0177 | (c & ~0177 & ~CHAR_CTL);
  96       /* ASCII control chars are made from letters (both cases),
  97          as well as the non-letters within 0100...0137.  */
  98       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
  99         c &= (037 | (~0177 & ~CHAR_CTL));
 100       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 101         c &= (037 | (~0177 & ~CHAR_CTL));
 102     }
 103 #if 0   /* This is outside the scope of this function.  (bug#4751)  */
 104   if (c & CHAR_META)
 105     {
 106       /* Move the meta bit to the right place for a string.  */
 107       c = (c & ~CHAR_META) | 0x80;
 108     }
 109 #endif
 110
 111   return c;
 112 }
 113
 114
 115 /* Store multibyte form of character C at P.  If C has modifier bits,
 116    handle them appropriately.  */
 117
 118 int
 119 char_string (unsigned int c, unsigned char *p)
 120 {
 121   int bytes;
 122
 123   if (c & CHAR_MODIFIER_MASK)
 124     {
 125       c = char_resolve_modifier_mask (c);
 126       /* If C still has any modifier bits, just ignore it.  */
 127       c &= ~CHAR_MODIFIER_MASK;
 128     }
 129
 130   MAYBE_UNIFY_CHAR (c);
 131
 132   if (c <= MAX_3_BYTE_CHAR)
 133     {
 134       bytes = CHAR_STRING (c, p);
 135     }
 136   else if (c <= MAX_4_BYTE_CHAR)
 137     {
 138       p[0] = (0xF0 | (c >> 18));
 139       p[1] = (0x80 | ((c >> 12) & 0x3F));
 140       p[2] = (0x80 | ((c >> 6) & 0x3F));
 141       p[3] = (0x80 | (c & 0x3F));
 142       bytes = 4;
 143     }
 144   else if (c <= MAX_5_BYTE_CHAR)
 145     {
 146       p[0] = 0xF8;
 147       p[1] = (0x80 | ((c >> 18) & 0x0F));
 148       p[2] = (0x80 | ((c >> 12) & 0x3F));
 149       p[3] = (0x80 | ((c >> 6) & 0x3F));
 150       p[4] = (0x80 | (c & 0x3F));
 151       bytes = 5;
 152     }
 153   else if (c <= MAX_CHAR)
 154     {
 155       c = CHAR_TO_BYTE8 (c);
 156       bytes = BYTE8_STRING (c, p);
 157     }
 158   else
 159     error ("Invalid character: %x", c);
 160
 161   return bytes;
 162 }
 163
 164
 165 /* Return a character whose multibyte form is at P.  If LEN is not
 166    NULL, it must be a pointer to integer.  In that case, set *LEN to
 167    the byte length of the multibyte form.  If ADVANCED is not NULL, it
 168    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 169    the ending address (i.e., the starting address of the next
 170    character) of the multibyte form.  */
 171
 172 int
 173 string_char (const unsigned char *p, const unsigned char **advanced, int *len)
 174 {
 175   int c;
 176   const unsigned char *saved_p = p;
 177
 178   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 179     {
 180       c = STRING_CHAR_ADVANCE (p);
 181     }
 182   else if (! (*p & 0x08))
 183     {
 184       c = ((((p)[0] & 0xF) << 18)
 185            | (((p)[1] & 0x3F) << 12)
 186            | (((p)[2] & 0x3F) << 6)
 187            | ((p)[3] & 0x3F));
 188       p += 4;
 189     }
 190   else
 191     {
 192       c = ((((p)[1] & 0x3F) << 18)
 193            | (((p)[2] & 0x3F) << 12)
 194            | (((p)[3] & 0x3F) << 6)
 195            | ((p)[4] & 0x3F));
 196       p += 5;
 197     }
 198
 199   MAYBE_UNIFY_CHAR (c);
 200
 201   if (len)
 202     *len = p - saved_p;
 203   if (advanced)
 204     *advanced = p;
 205   return c;
 206 }
 207
 208
 209 /* Translate character C by translation table TABLE.  If no translation is
 210    found in TABLE, return the untranslated character.  If TABLE is a list,
 211    elements are char tables.  In that case, recursively translate C by all the
 212    tables in the list.  */
 213
 214 int
 215 translate_char (Lisp_Object table, int c)
 216 {
 217   if (CHAR_TABLE_P (table))
 218     {
 219       Lisp_Object ch;
 220
 221       ch = CHAR_TABLE_REF (table, c);
 222       if (CHARACTERP (ch))
 223         c = XINT (ch);
 224     }
 225   else
 226     {
 227       for (; CONSP (table); table = XCDR (table))
 228         c = translate_char (XCAR (table), c);
 229     }
 230   return c;
 231 }
 232
 233 /* Convert ASCII or 8-bit character C to unibyte.  If C is none of
 234    them, return (C & 0xFF).  */
 235
 236 int
 237 multibyte_char_to_unibyte (int c)
 238 {
 239   if (c < 0x80)
 240     return c;
 241   if (CHAR_BYTE8_P (c))
 242     return CHAR_TO_BYTE8 (c);
 243   return (c & 0xFF);
 244 }
 245
 246 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 247    by charset_unibyte.  */
 248
 249 int
 250 multibyte_char_to_unibyte_safe (int c)
 251 {
 252   if (c < 0x80)
 253     return c;
 254   if (CHAR_BYTE8_P (c))
 255     return CHAR_TO_BYTE8 (c);
 256   return -1;
 257 }
 258
 259 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 260        doc: /* Return non-nil if OBJECT is a character.
 261 In Emacs Lisp, characters are represented by character codes, which
 262 are non-negative integers.  The function `max-char' returns the
 263 maximum character code.
 264 usage: (characterp OBJECT)  */)
 265   (Lisp_Object object, Lisp_Object ignore)
 266 {
 267   return (CHARACTERP (object) ? Qt : Qnil);
 268 }
 269
 270 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 271        doc: /* Return the character of the maximum code.  */)
 272   (void)
 273 {
 274   return make_number (MAX_CHAR);
 275 }
 276
 277 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 278        Sunibyte_char_to_multibyte, 1, 1, 0,
 279        doc: /* Convert the byte CH to multibyte character.  */)
 280   (Lisp_Object ch)
 281 {
 282   int c;
 283
 284   CHECK_CHARACTER (ch);
 285   c = XFASTINT (ch);
 286   if (c >= 0x100)
 287     error ("Not a unibyte character: %d", c);
 288   MAKE_CHAR_MULTIBYTE (c);
 289   return make_number (c);
 290 }
 291
 292 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 293        Smultibyte_char_to_unibyte, 1, 1, 0,
 294        doc: /* Convert the multibyte character CH to a byte.
 295 If the multibyte character does not represent a byte, return -1.  */)
 296   (Lisp_Object ch)
 297 {
 298   int cm;
 299
 300   CHECK_CHARACTER (ch);
 301   cm = XFASTINT (ch);
 302   if (cm < 256)
 303     /* Can't distinguish a byte read from a unibyte buffer from
 304        a latin1 char, so let's let it slide.  */
 305     return ch;
 306   else
 307     {
 308       int cu = CHAR_TO_BYTE_SAFE (cm);
 309       return make_number (cu);
 310     }
 311 }
 312
 313
 314 /* Return width (columns) of C considering the buffer display table DP. */
 315
 316 static ptrdiff_t
 317 char_width (int c, struct Lisp_Char_Table *dp)
 318 {
 319   ptrdiff_t width = CHAR_WIDTH (c);
 320
 321   if (dp)
 322     {
 323       Lisp_Object disp = DISP_CHAR_VECTOR (dp, c), ch;
 324       int i;
 325
 326       if (VECTORP (disp))
 327         for (i = 0, width = 0; i < ASIZE (disp); i++)
 328           {
 329             ch = AREF (disp, i);
 330             if (CHARACTERP (ch))
 331               {
 332                 int w = CHAR_WIDTH (XFASTINT (ch));
 333                 if (INT_ADD_OVERFLOW (width, w))
 334                   string_overflow ();
 335                 width += w;
 336               }
 337           }
 338     }
 339   return width;
 340 }
 341
 342
 343 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 344        doc: /* Return width of CHAR when displayed in the current buffer.
 345 The width is measured by how many columns it occupies on the screen.
 346 Tab is taken to occupy `tab-width' columns.
 347 usage: (char-width CHAR)  */)
 348   (Lisp_Object ch)
 349 {
 350   int c;
 351   ptrdiff_t width;
 352
 353   CHECK_CHARACTER (ch);
 354   c = XINT (ch);
 355   width = char_width (c, buffer_display_table ());
 356   return make_number (width);
 357 }
 358
 359 /* Return width of string STR of length LEN when displayed in the
 360    current buffer.  The width is measured by how many columns it
 361    occupies on the screen.  If PRECISION > 0, return the width of
 362    longest substring that doesn't exceed PRECISION, and set number of
 363    characters and bytes of the substring in *NCHARS and *NBYTES
 364    respectively.  */
 365
 366 ptrdiff_t
 367 c_string_width (const unsigned char *str, ptrdiff_t len, int precision,
 368                 ptrdiff_t *nchars, ptrdiff_t *nbytes)
 369 {
 370   ptrdiff_t i = 0, i_byte = 0;
 371   ptrdiff_t width = 0;
 372   struct Lisp_Char_Table *dp = buffer_display_table ();
 373
 374   while (i_byte < len)
 375     {
 376       int bytes;
 377       int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 378       ptrdiff_t thiswidth = char_width (c, dp);
 379
 380       if (precision <= 0)
 381         {
 382           if (INT_ADD_OVERFLOW (width, thiswidth))
 383             string_overflow ();
 384         }
 385       else if (precision - width < thiswidth)
 386         {
 387           *nchars = i;
 388           *nbytes = i_byte;
 389           return width;
 390         }
 391       i++;
 392       i_byte += bytes;
 393       width += thiswidth;
 394   }
 395
 396   if (precision > 0)
 397     {
 398       *nchars = i;
 399       *nbytes = i_byte;
 400     }
 401
 402   return width;
 403 }
 404
 405 /* Return width of string STR of length LEN when displayed in the
 406    current buffer.  The width is measured by how many columns it
 407    occupies on the screen.  */
 408
 409 ptrdiff_t
 410 strwidth (const char *str, ptrdiff_t len)
 411 {
 412   return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
 413 }
 414
 415 /* Return width of Lisp string STRING when displayed in the current
 416    buffer.  The width is measured by how many columns it occupies on
 417    the screen while paying attention to compositions.  If PRECISION >
 418    0, return the width of longest substring that doesn't exceed
 419    PRECISION, and set number of characters and bytes of the substring
 420    in *NCHARS and *NBYTES respectively.  */
 421
 422 ptrdiff_t
 423 lisp_string_width (Lisp_Object string, ptrdiff_t precision,
 424                    ptrdiff_t *nchars, ptrdiff_t *nbytes)
 425 {
 426   ptrdiff_t len = SCHARS (string);
 427   /* This set multibyte to 0 even if STRING is multibyte when it
 428      contains only ascii and eight-bit-graphic, but that's
 429      intentional.  */
 430   bool multibyte = len < SBYTES (string);
 431   unsigned char *str = SDATA (string);
 432   ptrdiff_t i = 0, i_byte = 0;
 433   ptrdiff_t width = 0;
 434   struct Lisp_Char_Table *dp = buffer_display_table ();
 435
 436   while (i < len)
 437     {
 438       ptrdiff_t chars, bytes, thiswidth;
 439       Lisp_Object val;
 440       ptrdiff_t cmp_id;
 441       ptrdiff_t ignore, end;
 442
 443       if (find_composition (i, -1, &ignore, &end, &val, string)
 444           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 445               >= 0))
 446         {
 447           thiswidth = composition_table[cmp_id]->width;
 448           chars = end - i;
 449           bytes = string_char_to_byte (string, end) - i_byte;
 450         }
 451       else
 452         {
 453           int c;
 454
 455           if (multibyte)
 456             {
 457               int cbytes;
 458               c = STRING_CHAR_AND_LENGTH (str + i_byte, cbytes);
 459               bytes = cbytes;
 460             }
 461           else
 462             c = str[i_byte], bytes = 1;
 463           chars = 1;
 464           thiswidth = char_width (c, dp);
 465         }
 466
 467       if (precision <= 0)
 468         {
 469 #ifdef emacs
 470           if (INT_ADD_OVERFLOW (width, thiswidth))
 471             string_overflow ();
 472 #endif
 473         }
 474       else if (precision - width < thiswidth)
 475         {
 476           *nchars = i;
 477           *nbytes = i_byte;
 478           return width;
 479         }
 480       i += chars;
 481       i_byte += bytes;
 482       width += thiswidth;
 483     }
 484
 485   if (precision > 0)
 486     {
 487       *nchars = i;
 488       *nbytes = i_byte;
 489     }
 490
 491   return width;
 492 }
 493
 494 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 495        doc: /* Return width of STRING when displayed in the current buffer.
 496 Width is measured by how many columns it occupies on the screen.
 497 When calculating width of a multibyte character in STRING,
 498 only the base leading-code is considered; the validity of
 499 the following bytes is not checked.  Tabs in STRING are always
 500 taken to occupy `tab-width' columns.
 501 usage: (string-width STRING)  */)
 502   (Lisp_Object str)
 503 {
 504   Lisp_Object val;
 505
 506   CHECK_STRING (str);
 507   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 508   return val;
 509 }
 510
 511 /* Return the number of characters in the NBYTES bytes at PTR.
 512    This works by looking at the contents and checking for multibyte
 513    sequences while assuming that there's no invalid sequence.
 514    However, if the current buffer has enable-multibyte-characters =
 515    nil, we treat each byte as a character.  */
 516
 517 ptrdiff_t
 518 chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 519 {
 520   /* current_buffer is null at early stages of Emacs initialization.  */
 521   if (current_buffer == 0
 522       || NILP (BVAR (current_buffer, enable_multibyte_characters)))
 523     return nbytes;
 524
 525   return multibyte_chars_in_text (ptr, nbytes);
 526 }
 527
 528 /* Return the number of characters in the NBYTES bytes at PTR.
 529    This works by looking at the contents and checking for multibyte
 530    sequences while assuming that there's no invalid sequence.  It
 531    ignores enable-multibyte-characters.  */
 532
 533 ptrdiff_t
 534 multibyte_chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 535 {
 536   const unsigned char *endp = ptr + nbytes;
 537   ptrdiff_t chars = 0;
 538
 539   while (ptr < endp)
 540     {
 541       int len = MULTIBYTE_LENGTH (ptr, endp);
 542
 543       if (len == 0)
 544         abort ();
 545       ptr += len;
 546       chars++;
 547     }
 548
 549   return chars;
 550 }
 551
 552 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 553    characters and bytes in it, and store them in *NCHARS and *NBYTES
 554    respectively.  On counting bytes, pay attention to that 8-bit
 555    characters not constructing a valid multibyte sequence are
 556    represented by 2-byte in a multibyte text.  */
 557
 558 void
 559 parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
 560                         ptrdiff_t *nchars, ptrdiff_t *nbytes)
 561 {
 562   const unsigned char *endp = str + len;
 563   int n;
 564   ptrdiff_t chars = 0, bytes = 0;
 565
 566   if (len >= MAX_MULTIBYTE_LENGTH)
 567     {
 568       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 569       while (str < adjusted_endp)
 570         {
 571           if (! CHAR_BYTE8_HEAD_P (*str)
 572               && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 573             str += n, bytes += n;
 574           else
 575             str++, bytes += 2;
 576           chars++;
 577         }
 578     }
 579   while (str < endp)
 580     {
 581       if (! CHAR_BYTE8_HEAD_P (*str)
 582           && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
 583         str += n, bytes += n;
 584       else
 585         str++, bytes += 2;
 586       chars++;
 587     }
 588
 589   *nchars = chars;
 590   *nbytes = bytes;
 591   return;
 592 }
 593
 594 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 595    It actually converts only such 8-bit characters that don't construct
 596    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 597    NCHARS is nonzero, set *NCHARS to the number of characters in the
 598    text.  It is assured that we can use LEN bytes at STR as a work
 599    area and that is enough.  Return the number of bytes of the
 600    resulting text.  */
 601
 602 ptrdiff_t
 603 str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
 604                   ptrdiff_t *nchars)
 605 {
 606   unsigned char *p = str, *endp = str + nbytes;
 607   unsigned char *to;
 608   ptrdiff_t chars = 0;
 609   int n;
 610
 611   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 612     {
 613       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 614       while (p < adjusted_endp
 615              && ! CHAR_BYTE8_HEAD_P (*p)
 616              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 617         p += n, chars++;
 618     }
 619   while (p < endp
 620          && ! CHAR_BYTE8_HEAD_P (*p)
 621          && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 622     p += n, chars++;
 623   if (nchars)
 624     *nchars = chars;
 625   if (p == endp)
 626     return nbytes;
 627
 628   to = p;
 629   nbytes = endp - p;
 630   endp = str + len;
 631   memmove (endp - nbytes, p, nbytes);
 632   p = endp - nbytes;
 633
 634   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 635     {
 636       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 637       while (p < adjusted_endp)
 638         {
 639           if (! CHAR_BYTE8_HEAD_P (*p)
 640               && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 641             {
 642               while (n--)
 643                 *to++ = *p++;
 644             }
 645           else
 646             {
 647               int c = *p++;
 648               c = BYTE8_TO_CHAR (c);
 649               to += CHAR_STRING (c, to);
 650             }
 651         }
 652       chars++;
 653     }
 654   while (p < endp)
 655     {
 656       if (! CHAR_BYTE8_HEAD_P (*p)
 657           && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 658         {
 659           while (n--)
 660             *to++ = *p++;
 661         }
 662       else
 663         {
 664           int c = *p++;
 665           c = BYTE8_TO_CHAR (c);
 666           to += CHAR_STRING (c, to);
 667         }
 668       chars++;
 669     }
 670   if (nchars)
 671     *nchars = chars;
 672   return (to - str);
 673 }
 674
 675 /* Parse unibyte string at STR of LEN bytes, and return the number of
 676    bytes it may occupy when converted to multibyte string by
 677    `str_to_multibyte'.  */
 678
 679 ptrdiff_t
 680 count_size_as_multibyte (const unsigned char *str, ptrdiff_t len)
 681 {
 682   const unsigned char *endp = str + len;
 683   ptrdiff_t bytes;
 684
 685   for (bytes = 0; str < endp; str++)
 686     {
 687       int n = *str < 0x80 ? 1 : 2;
 688       if (INT_ADD_OVERFLOW (bytes, n))
 689         string_overflow ();
 690       bytes += n;
 691     }
 692   return bytes;
 693 }
 694
 695
 696 /* Convert unibyte text at STR of BYTES bytes to a multibyte text
 697    that contains the same single-byte characters.  It actually
 698    converts all 8-bit characters to multibyte forms.  It is assured
 699    that we can use LEN bytes at STR as a work area and that is
 700    enough.  */
 701
 702 ptrdiff_t
 703 str_to_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t bytes)
 704 {
 705   unsigned char *p = str, *endp = str + bytes;
 706   unsigned char *to;
 707
 708   while (p < endp && *p < 0x80) p++;
 709   if (p == endp)
 710     return bytes;
 711   to = p;
 712   bytes = endp - p;
 713   endp = str + len;
 714   memmove (endp - bytes, p, bytes);
 715   p = endp - bytes;
 716   while (p < endp)
 717     {
 718       int c = *p++;
 719
 720       if (c >= 0x80)
 721         c = BYTE8_TO_CHAR (c);
 722       to += CHAR_STRING (c, to);
 723     }
 724   return (to - str);
 725 }
 726
 727 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 728    actually converts characters in the range 0x80..0xFF to
 729    unibyte.  */
 730
 731 ptrdiff_t
 732 str_as_unibyte (unsigned char *str, ptrdiff_t bytes)
 733 {
 734   const unsigned char *p = str, *endp = str + bytes;
 735   unsigned char *to;
 736   int c, len;
 737
 738   while (p < endp)
 739     {
 740       c = *p;
 741       len = BYTES_BY_CHAR_HEAD (c);
 742       if (CHAR_BYTE8_HEAD_P (c))
 743         break;
 744       p += len;
 745     }
 746   to = str + (p - str);
 747   while (p < endp)
 748     {
 749       c = *p;
 750       len = BYTES_BY_CHAR_HEAD (c);
 751       if (CHAR_BYTE8_HEAD_P (c))
 752         {
 753           c = STRING_CHAR_ADVANCE (p);
 754           *to++ = CHAR_TO_BYTE8 (c);
 755         }
 756       else
 757         {
 758           while (len--) *to++ = *p++;
 759         }
 760     }
 761   return (to - str);
 762 }
 763
 764 /* Convert eight-bit chars in SRC (in multibyte form) to the
 765    corresponding byte and store in DST.  CHARS is the number of
 766    characters in SRC.  The value is the number of bytes stored in DST.
 767    Usually, the value is the same as CHARS, but is less than it if SRC
 768    contains a non-ASCII, non-eight-bit character.  */
 769
 770 ptrdiff_t
 771 str_to_unibyte (const unsigned char *src, unsigned char *dst, ptrdiff_t chars)
 772 {
 773   ptrdiff_t i;
 774
 775   for (i = 0; i < chars; i++)
 776     {
 777       int c = STRING_CHAR_ADVANCE (src);
 778
 779       if (CHAR_BYTE8_P (c))
 780         c = CHAR_TO_BYTE8 (c);
 781       else if (! ASCII_CHAR_P (c))
 782         return i;
 783       *dst++ = c;
 784     }
 785   return i;
 786 }
 787
 788
 789 static ptrdiff_t
 790 string_count_byte8 (Lisp_Object string)
 791 {
 792   bool multibyte = STRING_MULTIBYTE (string);
 793   ptrdiff_t nbytes = SBYTES (string);
 794   unsigned char *p = SDATA (string);
 795   unsigned char *pend = p + nbytes;
 796   ptrdiff_t count = 0;
 797   int c, len;
 798
 799   if (multibyte)
 800     while (p < pend)
 801       {
 802         c = *p;
 803         len = BYTES_BY_CHAR_HEAD (c);
 804
 805         if (CHAR_BYTE8_HEAD_P (c))
 806           count++;
 807         p += len;
 808       }
 809   else
 810     while (p < pend)
 811       {
 812         if (*p++ >= 0x80)
 813           count++;
 814       }
 815   return count;
 816 }
 817
 818
 819 Lisp_Object
 820 string_escape_byte8 (Lisp_Object string)
 821 {
 822   ptrdiff_t nchars = SCHARS (string);
 823   ptrdiff_t nbytes = SBYTES (string);
 824   bool multibyte = STRING_MULTIBYTE (string);
 825   ptrdiff_t byte8_count;
 826   const unsigned char *src, *src_end;
 827   unsigned char *dst;
 828   Lisp_Object val;
 829   int c, len;
 830
 831   if (multibyte && nchars == nbytes)
 832     return string;
 833
 834   byte8_count = string_count_byte8 (string);
 835
 836   if (byte8_count == 0)
 837     return string;
 838
 839   if (multibyte)
 840     {
 841       if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
 842           || (STRING_BYTES_BOUND - nbytes) / 2 < byte8_count)
 843         string_overflow ();
 844
 845       /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 846       val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 847                                           nbytes + byte8_count * 2);
 848     }
 849   else
 850     {
 851       if ((STRING_BYTES_BOUND - nbytes) / 3 < byte8_count)
 852         string_overflow ();
 853
 854       /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 855       val = make_uninit_string (nbytes + byte8_count * 3);
 856     }
 857
 858   src = SDATA (string);
 859   src_end = src + nbytes;
 860   dst = SDATA (val);
 861   if (multibyte)
 862     while (src < src_end)
 863       {
 864         c = *src;
 865         len = BYTES_BY_CHAR_HEAD (c);
 866
 867         if (CHAR_BYTE8_HEAD_P (c))
 868           {
 869             c = STRING_CHAR_ADVANCE (src);
 870             c = CHAR_TO_BYTE8 (c);
 871             dst += sprintf ((char *) dst, "\\%03o", c);
 872           }
 873         else
 874           while (len--) *dst++ = *src++;
 875       }
 876   else
 877     while (src < src_end)
 878       {
 879         c = *src++;
 880         if (c >= 0x80)
 881           dst += sprintf ((char *) dst, "\\%03o", c);
 882         else
 883           *dst++ = c;
 884       }
 885   return val;
 886 }
 887
 888 \f
 889 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 890        doc: /*
 891 Concatenate all the argument characters and make the result a string.
 892 usage: (string &rest CHARACTERS)  */)
 893   (ptrdiff_t n, Lisp_Object *args)
 894 {
 895   ptrdiff_t i;
 896   int c;
 897   unsigned char *buf, *p;
 898   Lisp_Object str;
 899   USE_SAFE_ALLOCA;
 900
 901   SAFE_NALLOCA (buf, MAX_MULTIBYTE_LENGTH, n);
 902   p = buf;
 903
 904   for (i = 0; i < n; i++)
 905     {
 906       CHECK_CHARACTER (args[i]);
 907       c = XINT (args[i]);
 908       p += CHAR_STRING (c, p);
 909     }
 910
 911   str = make_string_from_bytes ((char *) buf, n, p - buf);
 912   SAFE_FREE ();
 913   return str;
 914 }
 915
 916 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 917        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 918 usage: (unibyte-string &rest BYTES)  */)
 919   (ptrdiff_t n, Lisp_Object *args)
 920 {
 921   ptrdiff_t i;
 922   Lisp_Object str;
 923   USE_SAFE_ALLOCA;
 924   unsigned char *buf = SAFE_ALLOCA (n);
 925   unsigned char *p = buf;
 926
 927   for (i = 0; i < n; i++)
 928     {
 929       CHECK_RANGED_INTEGER (args[i], 0, 255);
 930       *p++ = XINT (args[i]);
 931     }
 932
 933   str = make_string_from_bytes ((char *) buf, n, p - buf);
 934   SAFE_FREE ();
 935   return str;
 936 }
 937
 938 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
 939        Schar_resolve_modifiers, 1, 1, 0,
 940        doc: /* Resolve modifiers in the character CHAR.
 941 The value is a character with modifiers resolved into the character
 942 code.  Unresolved modifiers are kept in the value.
 943 usage: (char-resolve-modifiers CHAR)  */)
 944   (Lisp_Object character)
 945 {
 946   EMACS_INT c;
 947
 948   CHECK_NUMBER (character);
 949   c = XINT (character);
 950   return make_number (char_resolve_modifier_mask (c));
 951 }
 952
 953 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
 954        doc: /* Return a byte value of a character at point.
 955 Optional 1st arg POSITION, if non-nil, is a position of a character to get
 956 a byte value.
 957 Optional 2nd arg STRING, if non-nil, is a string of which first
 958 character is a target to get a byte value.  In this case, POSITION, if
 959 non-nil, is an index of a target character in the string.
 960
 961 If the current buffer (or STRING) is multibyte, and the target
 962 character is not ASCII nor 8-bit character, an error is signaled.  */)
 963   (Lisp_Object position, Lisp_Object string)
 964 {
 965   int c;
 966   ptrdiff_t pos;
 967   unsigned char *p;
 968
 969   if (NILP (string))
 970     {
 971       if (NILP (position))
 972         {
 973           p = PT_ADDR;
 974         }
 975       else
 976         {
 977           CHECK_NUMBER_COERCE_MARKER (position);
 978           if (XINT (position) < BEGV || XINT (position) >= ZV)
 979             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
 980           pos = XFASTINT (position);
 981           p = CHAR_POS_ADDR (pos);
 982         }
 983       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
 984         return make_number (*p);
 985     }
 986   else
 987     {
 988       CHECK_STRING (string);
 989       if (NILP (position))
 990         {
 991           p = SDATA (string);
 992         }
 993       else
 994         {
 995           CHECK_NATNUM (position);
 996           if (XINT (position) >= SCHARS (string))
 997             args_out_of_range (string, position);
 998           pos = XFASTINT (position);
 999           p = SDATA (string) + string_char_to_byte (string, pos);
1000         }
1001       if (! STRING_MULTIBYTE (string))
1002         return make_number (*p);
1003     }
1004   c = STRING_CHAR (p);
1005   if (CHAR_BYTE8_P (c))
1006     c = CHAR_TO_BYTE8 (c);
1007   else if (! ASCII_CHAR_P (c))
1008     error ("Not an ASCII nor an 8-bit character: %d", c);
1009   return make_number (c);
1010 }
1011
1012 #ifdef emacs
1013
/* Set up the Lisp-visible symbols, primitives and variables defined
   in this file: intern the symbols, register the subrs with defsubr,
   and define and initialize the DEFVAR_LISP variables.  */

void
syms_of_character (void)
{
  DEFSYM (Qcharacterp, "characterp");
  DEFSYM (Qauto_fill_chars, "auto-fill-chars");

  /* Vchar_unify_table is not a DEFVAR_LISP variable, so it is
     staticpro'd explicitly here.  */
  staticpro (&Vchar_unify_table);
  Vchar_unify_table = Qnil;

  /* Register the primitives defined with DEFUN in this file.  */
  defsubr (&Smax_char);
  defsubr (&Scharacterp);
  defsubr (&Sunibyte_char_to_multibyte);
  defsubr (&Smultibyte_char_to_unibyte);
  defsubr (&Schar_width);
  defsubr (&Sstring_width);
  defsubr (&Sstring);
  defsubr (&Sunibyte_string);
  defsubr (&Schar_resolve_modifiers);
  defsubr (&Sget_byte);

  DEFVAR_LISP ("translation-table-vector",  Vtranslation_table_vector,
               doc: /*
Vector recording all translation tables ever defined.
Each element is a pair (SYMBOL . TABLE) relating the table to the
symbol naming it.  The ID of a translation table is an index into this vector.  */);
  /* Start with a 16-element vector of nil.  */
  Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);

  DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
               doc: /*
A char-table for characters which invoke auto-filling.
Such characters have value t in this table.  */);
  /* Initially only space and newline are marked.  */
  Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
  CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
  CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);

  DEFVAR_LISP ("char-width-table", Vchar_width_table,
               doc: /*
A char-table for width (columns) of each character.  */);
  /* The default width is 1; the codes 0x80..0x9F and the codes above
     MAX_5_BYTE_CHAR are given width 4.  */
  Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
  char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
  char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
                        make_number (4));

  DEFVAR_LISP ("printable-chars", Vprintable_chars,
               doc: /* A char-table for each printable character.  */);
  /* Mark the codes 32..126 and 160..MAX_5_BYTE_CHAR as printable.  */
  Vprintable_chars = Fmake_char_table (Qnil, Qnil);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (32), make_number (126)), Qt);
  Fset_char_table_range (Vprintable_chars,
                         Fcons (make_number (160),
                                make_number (MAX_5_BYTE_CHAR)), Qt);

  DEFVAR_LISP ("char-script-table", Vchar_script_table,
               doc: /* Char table of script symbols.
It has one extra slot whose value is a list of script symbols.  */);

  /* Intern this now in case it isn't already done.
     Setting this variable twice is harmless.
     But don't staticpro it here--that is done in alloc.c.  */
  Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
  DEFSYM (Qchar_script_table, "char-script-table");
  /* Declare one extra slot for this char-table purpose before the
     table itself is created.  */
  Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
  Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);

  DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
               doc: /* Alist of scripts vs the representative characters.
Each element is a cons (SCRIPT . CHARS).
SCRIPT is a symbol representing a script or a subgroup of a script.
CHARS is a list or a vector of characters.
If it is a list, all characters in the list are necessary for supporting SCRIPT.
If it is a vector, one of the characters in the vector is necessary.
This variable is used to find a font for a specific script.  */);
  Vscript_representative_chars = Qnil;

  DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
               doc: /* Char table of Unicode's "General Category".
All Unicode characters have one of the following values (symbol):
  Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
  Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
See The Unicode Standard for the meaning of those values.  */);
  /* The correct char-table is setup in characters.el.  */
  Vunicode_category_table = Qnil;
}
1097
1098 #endif /* emacs */