src/character.c

   1 /* Basic character support.
   2
   3 Copyright (C) 2001-2012  Free Software Foundation, Inc.
   4 Copyright (C) 1995, 1997, 1998, 2001 Electrotechnical Laboratory, JAPAN.
   5   Licensed to the Free Software Foundation.
   6 Copyright (C) 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011
   7   National Institute of Advanced Industrial Science and Technology (AIST)
   8   Registration Number H13PRO009
   9
  10 This file is part of GNU Emacs.
  11
  12 GNU Emacs is free software: you can redistribute it and/or modify
  13 it under the terms of the GNU General Public License as published by
  14 the Free Software Foundation, either version 3 of the License, or
  15 (at your option) any later version.
  16
  17 GNU Emacs is distributed in the hope that it will be useful,
  18 but WITHOUT ANY WARRANTY; without even the implied warranty of
  19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  20 GNU General Public License for more details.
  21
  22 You should have received a copy of the GNU General Public License
  23 along with GNU Emacs.  If not, see <http://www.gnu.org/licenses/>.  */
  24
/* Before reading this file, see the documentation in `character.h';
   it explains the concepts the code here relies on.  */
  27
  28 #ifdef emacs
  29 #include <config.h>
  30 #endif
  31
  32 #define CHARACTER_INLINE EXTERN_INLINE
  33
  34 #include <stdio.h>
  35
  36 #ifdef emacs
  37
  38 #include <sys/types.h>
  39 #include <setjmp.h>
  40 #include <intprops.h>
  41 #include "lisp.h"
  42 #include "character.h"
  43 #include "buffer.h"
  44 #include "charset.h"
  45 #include "composite.h"
  46 #include "disptab.h"
  47
  48 #else  /* not emacs */
  49
  50 #include "mulelib.h"
  51
  52 #endif /* emacs */
  53
  54 Lisp_Object Qcharacterp;
  55
  56 static Lisp_Object Qauto_fill_chars;
  57
  58 /* Char-table of information about which character to unify to which
  59    Unicode character.  Mainly used by the macro MAYBE_UNIFY_CHAR.  */
  60 Lisp_Object Vchar_unify_table;
  61
  62 static Lisp_Object Qchar_script_table;
  63
  64 \f
  65
  66 /* If character code C has modifier masks, reflect them to the
  67    character code if possible.  Return the resulting code.  */
  68
  69 EMACS_INT
  70 char_resolve_modifier_mask (EMACS_INT c)
  71 {
  72   /* A non-ASCII character can't reflect modifier bits to the code.  */
  73   if (! ASCII_CHAR_P ((c & ~CHAR_MODIFIER_MASK)))
  74     return c;
  75
  76   /* For Meta, Shift, and Control modifiers, we need special care.  */
  77   if (c & CHAR_SHIFT)
  78     {
  79       /* Shift modifier is valid only with [A-Za-z].  */
  80       if ((c & 0377) >= 'A' && (c & 0377) <= 'Z')
  81         c &= ~CHAR_SHIFT;
  82       else if ((c & 0377) >= 'a' && (c & 0377) <= 'z')
  83         c = (c & ~CHAR_SHIFT) - ('a' - 'A');
  84       /* Shift modifier for control characters and SPC is ignored.  */
  85       else if ((c & ~CHAR_MODIFIER_MASK) <= 0x20)
  86         c &= ~CHAR_SHIFT;
  87     }
  88   if (c & CHAR_CTL)
  89     {
  90       /* Simulate the code in lread.c.  */
  91       /* Allow `\C- ' and `\C-?'.  */
  92       if ((c & 0377) == ' ')
  93         c &= ~0177 & ~ CHAR_CTL;
  94       else if ((c & 0377) == '?')
  95         c = 0177 | (c & ~0177 & ~CHAR_CTL);
  96       /* ASCII control chars are made from letters (both cases),
  97          as well as the non-letters within 0100...0137.  */
  98       else if ((c & 0137) >= 0101 && (c & 0137) <= 0132)
  99         c &= (037 | (~0177 & ~CHAR_CTL));
 100       else if ((c & 0177) >= 0100 && (c & 0177) <= 0137)
 101         c &= (037 | (~0177 & ~CHAR_CTL));
 102     }
 103 #if 0   /* This is outside the scope of this function.  (bug#4751)  */
 104   if (c & CHAR_META)
 105     {
 106       /* Move the meta bit to the right place for a string.  */
 107       c = (c & ~CHAR_META) | 0x80;
 108     }
 109 #endif
 110
 111   return c;
 112 }
 113
 114
 115 /* Store multibyte form of character C at P.  If C has modifier bits,
 116    handle them appropriately.  */
 117
 118 int
 119 char_string (unsigned int c, unsigned char *p)
 120 {
 121   int bytes;
 122
 123   if (c & CHAR_MODIFIER_MASK)
 124     {
 125       c = char_resolve_modifier_mask (c);
 126       /* If C still has any modifier bits, just ignore it.  */
 127       c &= ~CHAR_MODIFIER_MASK;
 128     }
 129
 130   MAYBE_UNIFY_CHAR (c);
 131
 132   if (c <= MAX_3_BYTE_CHAR)
 133     {
 134       bytes = CHAR_STRING (c, p);
 135     }
 136   else if (c <= MAX_4_BYTE_CHAR)
 137     {
 138       p[0] = (0xF0 | (c >> 18));
 139       p[1] = (0x80 | ((c >> 12) & 0x3F));
 140       p[2] = (0x80 | ((c >> 6) & 0x3F));
 141       p[3] = (0x80 | (c & 0x3F));
 142       bytes = 4;
 143     }
 144   else if (c <= MAX_5_BYTE_CHAR)
 145     {
 146       p[0] = 0xF8;
 147       p[1] = (0x80 | ((c >> 18) & 0x0F));
 148       p[2] = (0x80 | ((c >> 12) & 0x3F));
 149       p[3] = (0x80 | ((c >> 6) & 0x3F));
 150       p[4] = (0x80 | (c & 0x3F));
 151       bytes = 5;
 152     }
 153   else if (c <= MAX_CHAR)
 154     {
 155       c = CHAR_TO_BYTE8 (c);
 156       bytes = BYTE8_STRING (c, p);
 157     }
 158   else
 159     error ("Invalid character: %x", c);
 160
 161   return bytes;
 162 }
 163
 164
 165 /* Return a character whose multibyte form is at P.  If LEN is not
 166    NULL, it must be a pointer to integer.  In that case, set *LEN to
 167    the byte length of the multibyte form.  If ADVANCED is not NULL, it
 168    must be a pointer to unsigned char.  In that case, set *ADVANCED to
 169    the ending address (i.e., the starting address of the next
 170    character) of the multibyte form.  */
 171
 172 int
 173 string_char (const unsigned char *p, const unsigned char **advanced, int *len)
 174 {
 175   int c;
 176   const unsigned char *saved_p = p;
 177
 178   if (*p < 0x80 || ! (*p & 0x20) || ! (*p & 0x10))
 179     {
 180       c = STRING_CHAR_ADVANCE (p);
 181     }
 182   else if (! (*p & 0x08))
 183     {
 184       c = ((((p)[0] & 0xF) << 18)
 185            | (((p)[1] & 0x3F) << 12)
 186            | (((p)[2] & 0x3F) << 6)
 187            | ((p)[3] & 0x3F));
 188       p += 4;
 189     }
 190   else
 191     {
 192       c = ((((p)[1] & 0x3F) << 18)
 193            | (((p)[2] & 0x3F) << 12)
 194            | (((p)[3] & 0x3F) << 6)
 195            | ((p)[4] & 0x3F));
 196       p += 5;
 197     }
 198
 199   MAYBE_UNIFY_CHAR (c);
 200
 201   if (len)
 202     *len = p - saved_p;
 203   if (advanced)
 204     *advanced = p;
 205   return c;
 206 }
 207
 208
 209 /* Translate character C by translation table TABLE.  If no translation is
 210    found in TABLE, return the untranslated character.  If TABLE is a list,
 211    elements are char tables.  In that case, recursively translate C by all the
 212    tables in the list.  */
 213
 214 int
 215 translate_char (Lisp_Object table, int c)
 216 {
 217   if (CHAR_TABLE_P (table))
 218     {
 219       Lisp_Object ch;
 220
 221       ch = CHAR_TABLE_REF (table, c);
 222       if (CHARACTERP (ch))
 223         c = XINT (ch);
 224     }
 225   else
 226     {
 227       for (; CONSP (table); table = XCDR (table))
 228         c = translate_char (XCAR (table), c);
 229     }
 230   return c;
 231 }
 232
 233 /* Convert ASCII or 8-bit character C to unibyte.  If C is none of
 234    them, return (C & 0xFF).  */
 235
 236 int
 237 multibyte_char_to_unibyte (int c)
 238 {
 239   if (c < 0x80)
 240     return c;
 241   if (CHAR_BYTE8_P (c))
 242     return CHAR_TO_BYTE8 (c);
 243   return (c & 0xFF);
 244 }
 245
 246 /* Like multibyte_char_to_unibyte, but return -1 if C is not supported
 247    by charset_unibyte.  */
 248
 249 int
 250 multibyte_char_to_unibyte_safe (int c)
 251 {
 252   if (c < 0x80)
 253     return c;
 254   if (CHAR_BYTE8_P (c))
 255     return CHAR_TO_BYTE8 (c);
 256   return -1;
 257 }
 258
 259 DEFUN ("characterp", Fcharacterp, Scharacterp, 1, 2, 0,
 260        doc: /* Return non-nil if OBJECT is a character.
 261 In Emacs Lisp, characters are represented by character codes, which
 262 are non-negative integers.  The function `max-char' returns the
 263 maximum character code.
 264 usage: (characterp OBJECT)  */)
 265   (Lisp_Object object, Lisp_Object ignore)
 266 {
 267   return (CHARACTERP (object) ? Qt : Qnil);
 268 }
 269
 270 DEFUN ("max-char", Fmax_char, Smax_char, 0, 0, 0,
 271        doc: /* Return the character of the maximum code.  */)
 272   (void)
 273 {
 274   return make_number (MAX_CHAR);
 275 }
 276
 277 DEFUN ("unibyte-char-to-multibyte", Funibyte_char_to_multibyte,
 278        Sunibyte_char_to_multibyte, 1, 1, 0,
 279        doc: /* Convert the byte CH to multibyte character.  */)
 280   (Lisp_Object ch)
 281 {
 282   int c;
 283
 284   CHECK_CHARACTER (ch);
 285   c = XFASTINT (ch);
 286   if (c >= 0x100)
 287     error ("Not a unibyte character: %d", c);
 288   MAKE_CHAR_MULTIBYTE (c);
 289   return make_number (c);
 290 }
 291
 292 DEFUN ("multibyte-char-to-unibyte", Fmultibyte_char_to_unibyte,
 293        Smultibyte_char_to_unibyte, 1, 1, 0,
 294        doc: /* Convert the multibyte character CH to a byte.
 295 If the multibyte character does not represent a byte, return -1.  */)
 296   (Lisp_Object ch)
 297 {
 298   int cm;
 299
 300   CHECK_CHARACTER (ch);
 301   cm = XFASTINT (ch);
 302   if (cm < 256)
 303     /* Can't distinguish a byte read from a unibyte buffer from
 304        a latin1 char, so let's let it slide.  */
 305     return ch;
 306   else
 307     {
 308       int cu = CHAR_TO_BYTE_SAFE (cm);
 309       return make_number (cu);
 310     }
 311 }
 312
 313
 314 /* Return width (columns) of C considering the buffer display table DP. */
 315
 316 static ptrdiff_t
 317 char_width (int c, struct Lisp_Char_Table *dp)
 318 {
 319   ptrdiff_t width = CHAR_WIDTH (c);
 320
 321   if (dp)
 322     {
 323       Lisp_Object disp = DISP_CHAR_VECTOR (dp, c), ch;
 324       int i;
 325
 326       if (VECTORP (disp))
 327         for (i = 0, width = 0; i < ASIZE (disp); i++)
 328           {
 329             ch = AREF (disp, i);
 330             if (CHARACTERP (ch))
 331               {
 332                 int w = CHAR_WIDTH (XFASTINT (ch));
 333                 if (INT_ADD_OVERFLOW (width, w))
 334                   string_overflow ();
 335                 width += w;
 336               }
 337           }
 338     }
 339   return width;
 340 }
 341
 342
 343 DEFUN ("char-width", Fchar_width, Schar_width, 1, 1, 0,
 344        doc: /* Return width of CHAR when displayed in the current buffer.
 345 The width is measured by how many columns it occupies on the screen.
 346 Tab is taken to occupy `tab-width' columns.
 347 usage: (char-width CHAR)  */)
 348   (Lisp_Object ch)
 349 {
 350   int c;
 351   ptrdiff_t width;
 352
 353   CHECK_CHARACTER (ch);
 354   c = XINT (ch);
 355   width = char_width (c, buffer_display_table ());
 356   return make_number (width);
 357 }
 358
 359 /* Return width of string STR of length LEN when displayed in the
 360    current buffer.  The width is measured by how many columns it
 361    occupies on the screen.  If PRECISION > 0, return the width of
 362    longest substring that doesn't exceed PRECISION, and set number of
 363    characters and bytes of the substring in *NCHARS and *NBYTES
 364    respectively.  */
 365
 366 ptrdiff_t
 367 c_string_width (const unsigned char *str, ptrdiff_t len, int precision,
 368                 ptrdiff_t *nchars, ptrdiff_t *nbytes)
 369 {
 370   ptrdiff_t i = 0, i_byte = 0;
 371   ptrdiff_t width = 0;
 372   struct Lisp_Char_Table *dp = buffer_display_table ();
 373
 374   while (i_byte < len)
 375     {
 376       int bytes;
 377       int c = STRING_CHAR_AND_LENGTH (str + i_byte, bytes);
 378       ptrdiff_t thiswidth = char_width (c, dp);
 379
 380       if (precision <= 0)
 381         {
 382           if (INT_ADD_OVERFLOW (width, thiswidth))
 383             string_overflow ();
 384         }
 385       else if (precision - width < thiswidth)
 386         {
 387           *nchars = i;
 388           *nbytes = i_byte;
 389           return width;
 390         }
 391       i++;
 392       i_byte += bytes;
 393       width += thiswidth;
 394   }
 395
 396   if (precision > 0)
 397     {
 398       *nchars = i;
 399       *nbytes = i_byte;
 400     }
 401
 402   return width;
 403 }
 404
 405 /* Return width of string STR of length LEN when displayed in the
 406    current buffer.  The width is measured by how many columns it
 407    occupies on the screen.  */
 408
 409 ptrdiff_t
 410 strwidth (const char *str, ptrdiff_t len)
 411 {
 412   return c_string_width ((const unsigned char *) str, len, -1, NULL, NULL);
 413 }
 414
 415 /* Return width of Lisp string STRING when displayed in the current
 416    buffer.  The width is measured by how many columns it occupies on
 417    the screen while paying attention to compositions.  If PRECISION >
 418    0, return the width of longest substring that doesn't exceed
 419    PRECISION, and set number of characters and bytes of the substring
 420    in *NCHARS and *NBYTES respectively.  */
 421
 422 ptrdiff_t
 423 lisp_string_width (Lisp_Object string, ptrdiff_t precision,
 424                    ptrdiff_t *nchars, ptrdiff_t *nbytes)
 425 {
 426   ptrdiff_t len = SCHARS (string);
 427   /* This set multibyte to 0 even if STRING is multibyte when it
 428      contains only ascii and eight-bit-graphic, but that's
 429      intentional.  */
 430   int multibyte = len < SBYTES (string);
 431   unsigned char *str = SDATA (string);
 432   ptrdiff_t i = 0, i_byte = 0;
 433   ptrdiff_t width = 0;
 434   struct Lisp_Char_Table *dp = buffer_display_table ();
 435
 436   while (i < len)
 437     {
 438       ptrdiff_t chars, bytes, thiswidth;
 439       Lisp_Object val;
 440       ptrdiff_t cmp_id;
 441       ptrdiff_t ignore, end;
 442
 443       if (find_composition (i, -1, &ignore, &end, &val, string)
 444           && ((cmp_id = get_composition_id (i, i_byte, end - i, val, string))
 445               >= 0))
 446         {
 447           thiswidth = composition_table[cmp_id]->width;
 448           chars = end - i;
 449           bytes = string_char_to_byte (string, end) - i_byte;
 450         }
 451       else
 452         {
 453           int c;
 454
 455           if (multibyte)
 456             {
 457               int cbytes;
 458               c = STRING_CHAR_AND_LENGTH (str + i_byte, cbytes);
 459               bytes = cbytes;
 460             }
 461           else
 462             c = str[i_byte], bytes = 1;
 463           chars = 1;
 464           thiswidth = char_width (c, dp);
 465         }
 466
 467       if (precision <= 0)
 468         {
 469 #ifdef emacs
 470           if (INT_ADD_OVERFLOW (width, thiswidth))
 471             string_overflow ();
 472 #endif
 473         }
 474       else if (precision - width < thiswidth)
 475         {
 476           *nchars = i;
 477           *nbytes = i_byte;
 478           return width;
 479         }
 480       i += chars;
 481       i_byte += bytes;
 482       width += thiswidth;
 483     }
 484
 485   if (precision > 0)
 486     {
 487       *nchars = i;
 488       *nbytes = i_byte;
 489     }
 490
 491   return width;
 492 }
 493
 494 DEFUN ("string-width", Fstring_width, Sstring_width, 1, 1, 0,
 495        doc: /* Return width of STRING when displayed in the current buffer.
 496 Width is measured by how many columns it occupies on the screen.
 497 When calculating width of a multibyte character in STRING,
 498 only the base leading-code is considered; the validity of
 499 the following bytes is not checked.  Tabs in STRING are always
 500 taken to occupy `tab-width' columns.
 501 usage: (string-width STRING)  */)
 502   (Lisp_Object str)
 503 {
 504   Lisp_Object val;
 505
 506   CHECK_STRING (str);
 507   XSETFASTINT (val, lisp_string_width (str, -1, NULL, NULL));
 508   return val;
 509 }
 510
 511 /* Return the number of characters in the NBYTES bytes at PTR.
 512    This works by looking at the contents and checking for multibyte
 513    sequences while assuming that there's no invalid sequence.
 514    However, if the current buffer has enable-multibyte-characters =
 515    nil, we treat each byte as a character.  */
 516
 517 ptrdiff_t
 518 chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 519 {
 520   /* current_buffer is null at early stages of Emacs initialization.  */
 521   if (current_buffer == 0
 522       || NILP (BVAR (current_buffer, enable_multibyte_characters)))
 523     return nbytes;
 524
 525   return multibyte_chars_in_text (ptr, nbytes);
 526 }
 527
 528 /* Return the number of characters in the NBYTES bytes at PTR.
 529    This works by looking at the contents and checking for multibyte
 530    sequences while assuming that there's no invalid sequence.  It
 531    ignores enable-multibyte-characters.  */
 532
 533 ptrdiff_t
 534 multibyte_chars_in_text (const unsigned char *ptr, ptrdiff_t nbytes)
 535 {
 536   const unsigned char *endp = ptr + nbytes;
 537   ptrdiff_t chars = 0;
 538
 539   while (ptr < endp)
 540     {
 541       int len = MULTIBYTE_LENGTH (ptr, endp);
 542
 543       if (len == 0)
 544         abort ();
 545       ptr += len;
 546       chars++;
 547     }
 548
 549   return chars;
 550 }
 551
 552 /* Parse unibyte text at STR of LEN bytes as a multibyte text, count
 553    characters and bytes in it, and store them in *NCHARS and *NBYTES
 554    respectively.  On counting bytes, pay attention to that 8-bit
 555    characters not constructing a valid multibyte sequence are
 556    represented by 2-byte in a multibyte text.  */
 557
 558 void
 559 parse_str_as_multibyte (const unsigned char *str, ptrdiff_t len,
 560                         ptrdiff_t *nchars, ptrdiff_t *nbytes)
 561 {
 562   const unsigned char *endp = str + len;
 563   int n;
 564   ptrdiff_t chars = 0, bytes = 0;
 565
 566   if (len >= MAX_MULTIBYTE_LENGTH)
 567     {
 568       const unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 569       while (str < adjusted_endp)
 570         {
 571           if (! CHAR_BYTE8_HEAD_P (*str)
 572               && (n = MULTIBYTE_LENGTH_NO_CHECK (str)) > 0)
 573             str += n, bytes += n;
 574           else
 575             str++, bytes += 2;
 576           chars++;
 577         }
 578     }
 579   while (str < endp)
 580     {
 581       if (! CHAR_BYTE8_HEAD_P (*str)
 582           && (n = MULTIBYTE_LENGTH (str, endp)) > 0)
 583         str += n, bytes += n;
 584       else
 585         str++, bytes += 2;
 586       chars++;
 587     }
 588
 589   *nchars = chars;
 590   *nbytes = bytes;
 591   return;
 592 }
 593
 594 /* Arrange unibyte text at STR of NBYTES bytes as a multibyte text.
 595    It actually converts only such 8-bit characters that don't construct
 596    a multibyte sequence to multibyte forms of Latin-1 characters.  If
 597    NCHARS is nonzero, set *NCHARS to the number of characters in the
 598    text.  It is assured that we can use LEN bytes at STR as a work
 599    area and that is enough.  Return the number of bytes of the
 600    resulting text.  */
 601
 602 ptrdiff_t
 603 str_as_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t nbytes,
 604                   ptrdiff_t *nchars)
 605 {
 606   unsigned char *p = str, *endp = str + nbytes;
 607   unsigned char *to;
 608   ptrdiff_t chars = 0;
 609   int n;
 610
 611   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 612     {
 613       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 614       while (p < adjusted_endp
 615              && ! CHAR_BYTE8_HEAD_P (*p)
 616              && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 617         p += n, chars++;
 618     }
 619   while (p < endp
 620          && ! CHAR_BYTE8_HEAD_P (*p)
 621          && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 622     p += n, chars++;
 623   if (nchars)
 624     *nchars = chars;
 625   if (p == endp)
 626     return nbytes;
 627
 628   to = p;
 629   nbytes = endp - p;
 630   endp = str + len;
 631   memmove (endp - nbytes, p, nbytes);
 632   p = endp - nbytes;
 633
 634   if (nbytes >= MAX_MULTIBYTE_LENGTH)
 635     {
 636       unsigned char *adjusted_endp = endp - MAX_MULTIBYTE_LENGTH;
 637       while (p < adjusted_endp)
 638         {
 639           if (! CHAR_BYTE8_HEAD_P (*p)
 640               && (n = MULTIBYTE_LENGTH_NO_CHECK (p)) > 0)
 641             {
 642               while (n--)
 643                 *to++ = *p++;
 644             }
 645           else
 646             {
 647               int c = *p++;
 648               c = BYTE8_TO_CHAR (c);
 649               to += CHAR_STRING (c, to);
 650             }
 651         }
 652       chars++;
 653     }
 654   while (p < endp)
 655     {
 656       if (! CHAR_BYTE8_HEAD_P (*p)
 657           && (n = MULTIBYTE_LENGTH (p, endp)) > 0)
 658         {
 659           while (n--)
 660             *to++ = *p++;
 661         }
 662       else
 663         {
 664           int c = *p++;
 665           c = BYTE8_TO_CHAR (c);
 666           to += CHAR_STRING (c, to);
 667         }
 668       chars++;
 669     }
 670   if (nchars)
 671     *nchars = chars;
 672   return (to - str);
 673 }
 674
 675 /* Parse unibyte string at STR of LEN bytes, and return the number of
 676    bytes it may occupy when converted to multibyte string by
 677    `str_to_multibyte'.  */
 678
 679 ptrdiff_t
 680 count_size_as_multibyte (const unsigned char *str, ptrdiff_t len)
 681 {
 682   const unsigned char *endp = str + len;
 683   ptrdiff_t bytes;
 684
 685   for (bytes = 0; str < endp; str++)
 686     {
 687       int n = *str < 0x80 ? 1 : 2;
 688       if (INT_ADD_OVERFLOW (bytes, n))
 689         string_overflow ();
 690       bytes += n;
 691     }
 692   return bytes;
 693 }
 694
 695
 696 /* Convert unibyte text at STR of BYTES bytes to a multibyte text
 697    that contains the same single-byte characters.  It actually
 698    converts all 8-bit characters to multibyte forms.  It is assured
 699    that we can use LEN bytes at STR as a work area and that is
 700    enough.  */
 701
 702 ptrdiff_t
 703 str_to_multibyte (unsigned char *str, ptrdiff_t len, ptrdiff_t bytes)
 704 {
 705   unsigned char *p = str, *endp = str + bytes;
 706   unsigned char *to;
 707
 708   while (p < endp && *p < 0x80) p++;
 709   if (p == endp)
 710     return bytes;
 711   to = p;
 712   bytes = endp - p;
 713   endp = str + len;
 714   memmove (endp - bytes, p, bytes);
 715   p = endp - bytes;
 716   while (p < endp)
 717     {
 718       int c = *p++;
 719
 720       if (c >= 0x80)
 721         c = BYTE8_TO_CHAR (c);
 722       to += CHAR_STRING (c, to);
 723     }
 724   return (to - str);
 725 }
 726
 727 /* Arrange multibyte text at STR of LEN bytes as a unibyte text.  It
 728    actually converts characters in the range 0x80..0xFF to
 729    unibyte.  */
 730
 731 ptrdiff_t
 732 str_as_unibyte (unsigned char *str, ptrdiff_t bytes)
 733 {
 734   const unsigned char *p = str, *endp = str + bytes;
 735   unsigned char *to;
 736   int c, len;
 737
 738   while (p < endp)
 739     {
 740       c = *p;
 741       len = BYTES_BY_CHAR_HEAD (c);
 742       if (CHAR_BYTE8_HEAD_P (c))
 743         break;
 744       p += len;
 745     }
 746   to = str + (p - str);
 747   while (p < endp)
 748     {
 749       c = *p;
 750       len = BYTES_BY_CHAR_HEAD (c);
 751       if (CHAR_BYTE8_HEAD_P (c))
 752         {
 753           c = STRING_CHAR_ADVANCE (p);
 754           *to++ = CHAR_TO_BYTE8 (c);
 755         }
 756       else
 757         {
 758           while (len--) *to++ = *p++;
 759         }
 760     }
 761   return (to - str);
 762 }
 763
 764 /* Convert eight-bit chars in SRC (in multibyte form) to the
 765    corresponding byte and store in DST.  CHARS is the number of
 766    characters in SRC.  The value is the number of bytes stored in DST.
 767    Usually, the value is the same as CHARS, but is less than it if SRC
 768    contains a non-ASCII, non-eight-bit character.  If ACCEPT_LATIN_1
 769    is nonzero, a Latin-1 character is accepted and converted to a byte
 770    of that character code.
 771    Note: Currently the arg ACCEPT_LATIN_1 is not used.  */
 772
 773 ptrdiff_t
 774 str_to_unibyte (const unsigned char *src, unsigned char *dst, ptrdiff_t chars, int accept_latin_1)
 775 {
 776   ptrdiff_t i;
 777
 778   for (i = 0; i < chars; i++)
 779     {
 780       int c = STRING_CHAR_ADVANCE (src);
 781
 782       if (CHAR_BYTE8_P (c))
 783         c = CHAR_TO_BYTE8 (c);
 784       else if (! ASCII_CHAR_P (c)
 785                && (! accept_latin_1 || c >= 0x100))
 786         return i;
 787       *dst++ = c;
 788     }
 789   return i;
 790 }
 791
 792
 793 static ptrdiff_t
 794 string_count_byte8 (Lisp_Object string)
 795 {
 796   int multibyte = STRING_MULTIBYTE (string);
 797   ptrdiff_t nbytes = SBYTES (string);
 798   unsigned char *p = SDATA (string);
 799   unsigned char *pend = p + nbytes;
 800   ptrdiff_t count = 0;
 801   int c, len;
 802
 803   if (multibyte)
 804     while (p < pend)
 805       {
 806         c = *p;
 807         len = BYTES_BY_CHAR_HEAD (c);
 808
 809         if (CHAR_BYTE8_HEAD_P (c))
 810           count++;
 811         p += len;
 812       }
 813   else
 814     while (p < pend)
 815       {
 816         if (*p++ >= 0x80)
 817           count++;
 818       }
 819   return count;
 820 }
 821
 822
 823 Lisp_Object
 824 string_escape_byte8 (Lisp_Object string)
 825 {
 826   ptrdiff_t nchars = SCHARS (string);
 827   ptrdiff_t nbytes = SBYTES (string);
 828   int multibyte = STRING_MULTIBYTE (string);
 829   ptrdiff_t byte8_count;
 830   const unsigned char *src, *src_end;
 831   unsigned char *dst;
 832   Lisp_Object val;
 833   int c, len;
 834
 835   if (multibyte && nchars == nbytes)
 836     return string;
 837
 838   byte8_count = string_count_byte8 (string);
 839
 840   if (byte8_count == 0)
 841     return string;
 842
 843   if (multibyte)
 844     {
 845       if ((MOST_POSITIVE_FIXNUM - nchars) / 3 < byte8_count
 846           || (STRING_BYTES_BOUND - nbytes) / 2 < byte8_count)
 847         string_overflow ();
 848
 849       /* Convert 2-byte sequence of byte8 chars to 4-byte octal.  */
 850       val = make_uninit_multibyte_string (nchars + byte8_count * 3,
 851                                           nbytes + byte8_count * 2);
 852     }
 853   else
 854     {
 855       if ((STRING_BYTES_BOUND - nbytes) / 3 < byte8_count)
 856         string_overflow ();
 857
 858       /* Convert 1-byte sequence of byte8 chars to 4-byte octal.  */
 859       val = make_uninit_string (nbytes + byte8_count * 3);
 860     }
 861
 862   src = SDATA (string);
 863   src_end = src + nbytes;
 864   dst = SDATA (val);
 865   if (multibyte)
 866     while (src < src_end)
 867       {
 868         c = *src;
 869         len = BYTES_BY_CHAR_HEAD (c);
 870
 871         if (CHAR_BYTE8_HEAD_P (c))
 872           {
 873             c = STRING_CHAR_ADVANCE (src);
 874             c = CHAR_TO_BYTE8 (c);
 875             dst += sprintf ((char *) dst, "\\%03o", c);
 876           }
 877         else
 878           while (len--) *dst++ = *src++;
 879       }
 880   else
 881     while (src < src_end)
 882       {
 883         c = *src++;
 884         if (c >= 0x80)
 885           dst += sprintf ((char *) dst, "\\%03o", c);
 886         else
 887           *dst++ = c;
 888       }
 889   return val;
 890 }
 891
 892 \f
 893 DEFUN ("string", Fstring, Sstring, 0, MANY, 0,
 894        doc: /*
 895 Concatenate all the argument characters and make the result a string.
 896 usage: (string &rest CHARACTERS)  */)
 897   (ptrdiff_t n, Lisp_Object *args)
 898 {
 899   ptrdiff_t i;
 900   int c;
 901   unsigned char *buf, *p;
 902   Lisp_Object str;
 903   USE_SAFE_ALLOCA;
 904
 905   SAFE_NALLOCA (buf, MAX_MULTIBYTE_LENGTH, n);
 906   p = buf;
 907
 908   for (i = 0; i < n; i++)
 909     {
 910       CHECK_CHARACTER (args[i]);
 911       c = XINT (args[i]);
 912       p += CHAR_STRING (c, p);
 913     }
 914
 915   str = make_string_from_bytes ((char *) buf, n, p - buf);
 916   SAFE_FREE ();
 917   return str;
 918 }
 919
 920 DEFUN ("unibyte-string", Funibyte_string, Sunibyte_string, 0, MANY, 0,
 921        doc: /* Concatenate all the argument bytes and make the result a unibyte string.
 922 usage: (unibyte-string &rest BYTES)  */)
 923   (ptrdiff_t n, Lisp_Object *args)
 924 {
 925   ptrdiff_t i;
 926   Lisp_Object str;
 927   USE_SAFE_ALLOCA;
 928   unsigned char *buf = SAFE_ALLOCA (n);
 929   unsigned char *p = buf;
 930
 931   for (i = 0; i < n; i++)
 932     {
 933       CHECK_RANGED_INTEGER (args[i], 0, 255);
 934       *p++ = XINT (args[i]);
 935     }
 936
 937   str = make_string_from_bytes ((char *) buf, n, p - buf);
 938   SAFE_FREE ();
 939   return str;
 940 }
 941
 942 DEFUN ("char-resolve-modifiers", Fchar_resolve_modifiers,
 943        Schar_resolve_modifiers, 1, 1, 0,
 944        doc: /* Resolve modifiers in the character CHAR.
 945 The value is a character with modifiers resolved into the character
 946 code.  Unresolved modifiers are kept in the value.
 947 usage: (char-resolve-modifiers CHAR)  */)
 948   (Lisp_Object character)
 949 {
 950   EMACS_INT c;
 951
 952   CHECK_NUMBER (character);
 953   c = XINT (character);
 954   return make_number (char_resolve_modifier_mask (c));
 955 }
 956
 957 DEFUN ("get-byte", Fget_byte, Sget_byte, 0, 2, 0,
 958        doc: /* Return a byte value of a character at point.
 959 Optional 1st arg POSITION, if non-nil, is a position of a character to get
 960 a byte value.
 961 Optional 2nd arg STRING, if non-nil, is a string of which first
 962 character is a target to get a byte value.  In this case, POSITION, if
 963 non-nil, is an index of a target character in the string.
 964
 965 If the current buffer (or STRING) is multibyte, and the target
 966 character is not ASCII nor 8-bit character, an error is signaled.  */)
 967   (Lisp_Object position, Lisp_Object string)
 968 {
 969   int c;
 970   ptrdiff_t pos;
 971   unsigned char *p;
 972
 973   if (NILP (string))
 974     {
 975       if (NILP (position))
 976         {
 977           p = PT_ADDR;
 978         }
 979       else
 980         {
 981           CHECK_NUMBER_COERCE_MARKER (position);
 982           if (XINT (position) < BEGV || XINT (position) >= ZV)
 983             args_out_of_range_3 (position, make_number (BEGV), make_number (ZV));
 984           pos = XFASTINT (position);
 985           p = CHAR_POS_ADDR (pos);
 986         }
 987       if (NILP (BVAR (current_buffer, enable_multibyte_characters)))
 988         return make_number (*p);
 989     }
 990   else
 991     {
 992       CHECK_STRING (string);
 993       if (NILP (position))
 994         {
 995           p = SDATA (string);
 996         }
 997       else
 998         {
 999           CHECK_NATNUM (position);
1000           if (XINT (position) >= SCHARS (string))
1001             args_out_of_range (string, position);
1002           pos = XFASTINT (position);
1003           p = SDATA (string) + string_char_to_byte (string, pos);
1004         }
1005       if (! STRING_MULTIBYTE (string))
1006         return make_number (*p);
1007     }
1008   c = STRING_CHAR (p);
1009   if (CHAR_BYTE8_P (c))
1010     c = CHAR_TO_BYTE8 (c);
1011   else if (! ASCII_CHAR_P (c))
1012     error ("Not an ASCII nor an 8-bit character: %d", c);
1013   return make_number (c);
1014 }
1015
1016 #ifdef emacs
1017
1018 void
1019 syms_of_character (void)
1020 {
1021   DEFSYM (Qcharacterp, "characterp");
1022   DEFSYM (Qauto_fill_chars, "auto-fill-chars");
1023
1024   staticpro (&Vchar_unify_table);
1025   Vchar_unify_table = Qnil;
1026
1027   defsubr (&Smax_char);
1028   defsubr (&Scharacterp);
1029   defsubr (&Sunibyte_char_to_multibyte);
1030   defsubr (&Smultibyte_char_to_unibyte);
1031   defsubr (&Schar_width);
1032   defsubr (&Sstring_width);
1033   defsubr (&Sstring);
1034   defsubr (&Sunibyte_string);
1035   defsubr (&Schar_resolve_modifiers);
1036   defsubr (&Sget_byte);
1037
1038   DEFVAR_LISP ("translation-table-vector",  Vtranslation_table_vector,
1039                doc: /*
1040 Vector recording all translation tables ever defined.
1041 Each element is a pair (SYMBOL . TABLE) relating the table to the
1042 symbol naming it.  The ID of a translation table is an index into this vector.  */);
1043   Vtranslation_table_vector = Fmake_vector (make_number (16), Qnil);
1044
1045   DEFVAR_LISP ("auto-fill-chars", Vauto_fill_chars,
1046                doc: /*
1047 A char-table for characters which invoke auto-filling.
1048 Such characters have value t in this table.  */);
1049   Vauto_fill_chars = Fmake_char_table (Qauto_fill_chars, Qnil);
1050   CHAR_TABLE_SET (Vauto_fill_chars, ' ', Qt);
1051   CHAR_TABLE_SET (Vauto_fill_chars, '\n', Qt);
1052
1053   DEFVAR_LISP ("char-width-table", Vchar_width_table,
1054                doc: /*
1055 A char-table for width (columns) of each character.  */);
1056   Vchar_width_table = Fmake_char_table (Qnil, make_number (1));
1057   char_table_set_range (Vchar_width_table, 0x80, 0x9F, make_number (4));
1058   char_table_set_range (Vchar_width_table, MAX_5_BYTE_CHAR + 1, MAX_CHAR,
1059                         make_number (4));
1060
1061   DEFVAR_LISP ("printable-chars", Vprintable_chars,
1062                doc: /* A char-table for each printable character.  */);
1063   Vprintable_chars = Fmake_char_table (Qnil, Qnil);
1064   Fset_char_table_range (Vprintable_chars,
1065                          Fcons (make_number (32), make_number (126)), Qt);
1066   Fset_char_table_range (Vprintable_chars,
1067                          Fcons (make_number (160),
1068                                 make_number (MAX_5_BYTE_CHAR)), Qt);
1069
1070   DEFVAR_LISP ("char-script-table", Vchar_script_table,
1071                doc: /* Char table of script symbols.
1072 It has one extra slot whose value is a list of script symbols.  */);
1073
1074   /* Intern this now in case it isn't already done.
1075      Setting this variable twice is harmless.
1076      But don't staticpro it here--that is done in alloc.c.  */
1077   Qchar_table_extra_slots = intern_c_string ("char-table-extra-slots");
1078   DEFSYM (Qchar_script_table, "char-script-table");
1079   Fput (Qchar_script_table, Qchar_table_extra_slots, make_number (1));
1080   Vchar_script_table = Fmake_char_table (Qchar_script_table, Qnil);
1081
1082   DEFVAR_LISP ("script-representative-chars", Vscript_representative_chars,
1083                doc: /* Alist of scripts vs the representative characters.
1084 Each element is a cons (SCRIPT . CHARS).
1085 SCRIPT is a symbol representing a script or a subgroup of a script.
1086 CHARS is a list or a vector of characters.
1087 If it is a list, all characters in the list are necessary for supporting SCRIPT.
1088 If it is a vector, one of the characters in the vector is necessary.
1089 This variable is used to find a font for a specific script.  */);
1090   Vscript_representative_chars = Qnil;
1091
1092   DEFVAR_LISP ("unicode-category-table", Vunicode_category_table,
1093                doc: /* Char table of Unicode's "General Category".
1094 All Unicode characters have one of the following values (symbol):
1095   Lu, Ll, Lt, Lm, Lo, Mn, Mc, Me, Nd, Nl, No, Pc, Pd, Ps, Pe, Pi, Pf, Po,
1096   Sm, Sc, Sk, So, Zs, Zl, Zp, Cc, Cf, Cs, Co, Cn
1097 See The Unicode Standard for the meaning of those values.  */);
1098   /* The correct char-table is setup in characters.el.  */
1099   Vunicode_category_table = Qnil;
1100 }
1101
1102 #endif /* emacs */