libguile/strings.c

   1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
   2  *
   3  * This library is free software; you can redistribute it and/or
   4  * modify it under the terms of the GNU Lesser General Public License
   5  * as published by the Free Software Foundation; either version 3 of
   6  * the License, or (at your option) any later version.
   7  *
   8  * This library is distributed in the hope that it will be useful, but
   9  * WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * Lesser General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU Lesser General Public
  14  * License along with this library; if not, write to the Free Software
  15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  16  * 02110-1301 USA
  17  */
  18
  19
  20 \f
  21 #ifdef HAVE_CONFIG_H
  22 # include <config.h>
  23 #endif
  24
  25 #include <alloca.h>
  26 #include <string.h>
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <uninorm.h>
  30 #include <unistr.h>
  31 #include <uniconv.h>
  32 #include <c-strcase.h>
  33
  34 #include "striconveh.h"
  35
  36 #include "libguile/_scm.h"
  37 #include "libguile/chars.h"
  38 #include "libguile/root.h"
  39 #include "libguile/strings.h"
  40 #include "libguile/ports.h"
  41 #include "libguile/ports-internal.h"
  42 #include "libguile/error.h"
  43 #include "libguile/generalized-vectors.h"
  44 #include "libguile/deprecation.h"
  45 #include "libguile/validate.h"
  46 #include "libguile/private-options.h"
  47
  48 \f
  49
  50 /* {Strings}
  51  */
  52
  53
  54 /* Stringbufs
  55  *
  56  * XXX - keeping an accurate refcount during GC seems to be quite
  57  * tricky, so we just keep score of whether a stringbuf might be
  58  * shared, not whether it definitely is.
  59  *
  60  * The scheme I (mvo) tried to keep an accurate reference count would
  61  * recount all strings that point to a stringbuf during the mark-phase
  62  * of the GC.  This was done since one cannot access the stringbuf of
  63  * a string when that string is freed (in order to decrease the
  64  * reference count).  The memory of the stringbuf might have been
  65  * reused already for something completely different.
  66  *
  67  * This recounted worked for a small number of threads beating on
  68  * cow-strings, but it failed randomly with more than 10 threads, say.
  69  * I couldn't figure out what went wrong, so I used the conservative
  70  * approach implemented below.
  71  *
  72  * There are 2 storage strategies for stringbufs: 8-bit and wide.  8-bit
  73  * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
  74  * strings.
  75  */
  76
  77 /* The size in words of the stringbuf header (type tag + size).  */
  78 #define STRINGBUF_HEADER_SIZE   2U
  79
  80 #define STRINGBUF_HEADER_BYTES  (STRINGBUF_HEADER_SIZE * sizeof (SCM))
  81
  82 #define STRINGBUF_F_SHARED      SCM_I_STRINGBUF_F_SHARED
  83 #define STRINGBUF_F_WIDE        SCM_I_STRINGBUF_F_WIDE
  84
  85 #define STRINGBUF_TAG           scm_tc7_stringbuf
  86 #define STRINGBUF_SHARED(buf)   (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
  87 #define STRINGBUF_WIDE(buf)     (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
  88
  89 #define STRINGBUF_CONTENTS(buf) ((void *)                               \
  90                                  SCM_CELL_OBJECT_LOC (buf,              \
  91                                                       STRINGBUF_HEADER_SIZE))
  92 #define STRINGBUF_CHARS(buf)    ((unsigned char *) STRINGBUF_CONTENTS (buf))
  93 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
  94
  95 #define STRINGBUF_LENGTH(buf)   (SCM_CELL_WORD_1 (buf))
  96
  97 #define SET_STRINGBUF_SHARED(buf)                                       \
  98   do                                                                    \
  99     {                                                                   \
 100       /* Don't modify BUF if it's already marked as shared since it might be \
 101          a read-only, statically allocated stringbuf.  */               \
 102       if (SCM_LIKELY (!STRINGBUF_SHARED (buf)))                         \
 103         SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
 104     }                                                                   \
 105   while (0)
 106
 107 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 108 static size_t lenhist[1001];
 109 #endif
 110
 111 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
 112    characters. */
 113 static SCM
 114 make_stringbuf (size_t len)
 115 {
 116   /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
 117      scm_i_symbol_chars, all stringbufs are null-terminated.  Once
 118      SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
 119      has been changed for scm_i_symbol_chars, this null-termination
 120      can be dropped.
 121   */
 122
 123   SCM buf;
 124
 125 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 126   if (len < 1000)
 127     lenhist[len]++;
 128   else
 129     lenhist[1000]++;
 130 #endif
 131
 132   buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
 133                                             "string"));
 134
 135   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
 136   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 137
 138   STRINGBUF_CHARS (buf)[len] = 0;
 139
 140   return buf;
 141 }
 142
 143 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
 144    characters.  */
 145 static SCM
 146 make_wide_stringbuf (size_t len)
 147 {
 148   SCM buf;
 149   size_t raw_len;
 150
 151 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 152   if (len < 1000)
 153     lenhist[len]++;
 154   else
 155     lenhist[1000]++;
 156 #endif
 157
 158   raw_len = (len + 1) * sizeof (scm_t_wchar);
 159   buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
 160                                             "string"));
 161
 162   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
 163   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 164
 165   STRINGBUF_WIDE_CHARS (buf)[len] = 0;
 166
 167   return buf;
 168 }
 169
 170 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
 171    characters from BUF.  */
 172 static SCM
 173 wide_stringbuf (SCM buf)
 174 {
 175   SCM new_buf;
 176
 177   if (STRINGBUF_WIDE (buf))
 178     new_buf = buf;
 179   else
 180     {
 181       size_t i, len;
 182       scm_t_wchar *mem;
 183
 184       len = STRINGBUF_LENGTH (buf);
 185
 186       new_buf = make_wide_stringbuf (len);
 187
 188       mem = STRINGBUF_WIDE_CHARS (new_buf);
 189       for (i = 0; i < len; i++)
 190         mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
 191       mem[len] = 0;
 192     }
 193
 194   return new_buf;
 195 }
 196
 197 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
 198    characters from BUF, if possible.  */
 199 static SCM
 200 narrow_stringbuf (SCM buf)
 201 {
 202   SCM new_buf;
 203
 204   if (!STRINGBUF_WIDE (buf))
 205     new_buf = buf;
 206   else
 207     {
 208       size_t i, len;
 209       scm_t_wchar *wmem;
 210       unsigned char *mem;
 211
 212       len = STRINGBUF_LENGTH (buf);
 213       wmem = STRINGBUF_WIDE_CHARS (buf);
 214
 215       for (i = 0; i < len; i++)
 216         if (wmem[i] > 0xFF)
 217           /* BUF cannot be narrowed.  */
 218           return buf;
 219
 220       new_buf = make_stringbuf (len);
 221
 222       mem = STRINGBUF_CHARS (new_buf);
 223       for (i = 0; i < len; i++)
 224         mem[i] = (unsigned char) wmem[i];
 225       mem[len] = 0;
 226     }
 227
 228   return new_buf;
 229 }
 230
 231 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
 232
 233 \f
 234 /* Copy-on-write strings.
 235  */
 236
 237 #define STRING_TAG            scm_tc7_string
 238
 239 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
 240 #define STRING_START(str)     ((size_t)SCM_CELL_WORD_2(str))
 241 #define STRING_LENGTH(str)    ((size_t)SCM_CELL_WORD_3(str))
 242
 243 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
 244 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
 245
 246 #define IS_STRING(str)        (SCM_HAS_TYP7 (str, STRING_TAG))
 247
 248 /* Read-only strings.
 249  */
 250
 251 #define RO_STRING_TAG         scm_tc7_ro_string
 252 #define IS_RO_STRING(str)     (SCM_CELL_TYPE(str)==RO_STRING_TAG)
 253
 254 /* Mutation-sharing substrings
 255  */
 256
 257 #define SH_STRING_TAG       (scm_tc7_string + 0x100)
 258
 259 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
 260 /* START and LENGTH as for STRINGs. */
 261
 262 #define IS_SH_STRING(str)   (SCM_CELL_TYPE(str)==SH_STRING_TAG)
 263
 264 SCM scm_nullstr;
 265
 266 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
 267    characters.  CHARSP, if not NULL, will be set to location of the
 268    char array.  If READ_ONLY_P, the returned string is read-only;
 269    otherwise it is writable.  */
 270 SCM
 271 scm_i_make_string (size_t len, char **charsp, int read_only_p)
 272 {
 273   static SCM null_stringbuf = SCM_BOOL_F;
 274   SCM buf;
 275   SCM res;
 276
 277   if (len == 0)
 278     {
 279       if (SCM_UNLIKELY (scm_is_false (null_stringbuf)))
 280         {
 281           null_stringbuf = make_stringbuf (0);
 282           SET_STRINGBUF_SHARED (null_stringbuf);
 283         }
 284       buf = null_stringbuf;
 285     }
 286   else
 287     buf = make_stringbuf (len);
 288
 289   if (charsp)
 290     *charsp = (char *) STRINGBUF_CHARS (buf);
 291   res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
 292                          SCM_UNPACK (buf),
 293                          (scm_t_bits) 0, (scm_t_bits) len);
 294   return res;
 295 }
 296
 297 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
 298    characters.  CHARSP, if not NULL, will be set to location of the
 299    character array.  If READ_ONLY_P, the returned string is read-only;
 300    otherwise it is writable.  */
 301 SCM
 302 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
 303 {
 304   SCM buf = make_wide_stringbuf (len);
 305   SCM res;
 306   if (charsp)
 307     *charsp = STRINGBUF_WIDE_CHARS (buf);
 308   res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
 309                          SCM_UNPACK (buf),
 310                          (scm_t_bits) 0, (scm_t_bits) len);
 311   return res;
 312 }
 313
 314 static void
 315 validate_substring_args (SCM str, size_t start, size_t end)
 316 {
 317   if (!IS_STRING (str))
 318     scm_wrong_type_arg_msg (NULL, 0, str, "string");
 319   if (start > STRING_LENGTH (str))
 320     scm_out_of_range (NULL, scm_from_size_t (start));
 321   if (end > STRING_LENGTH (str) || end < start)
 322     scm_out_of_range (NULL, scm_from_size_t (end));
 323 }
 324
 325 static inline void
 326 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
 327 {
 328   *start = STRING_START (*str);
 329   if (IS_SH_STRING (*str))
 330     {
 331       *str = SH_STRING_STRING (*str);
 332       *start += STRING_START (*str);
 333     }
 334   *buf = STRING_STRINGBUF (*str);
 335 }
 336
 337 SCM
 338 scm_i_substring (SCM str, size_t start, size_t end)
 339 {
 340   if (start == end)
 341     return scm_i_make_string (0, NULL, 0);
 342   else
 343     {
 344       SCM buf;
 345       size_t str_start;
 346       get_str_buf_start (&str, &buf, &str_start);
 347       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 348       SET_STRINGBUF_SHARED (buf);
 349       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 350       return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
 351                               (scm_t_bits)str_start + start,
 352                               (scm_t_bits) end - start);
 353     }
 354 }
 355
 356 SCM
 357 scm_i_substring_read_only (SCM str, size_t start, size_t end)
 358 {
 359   if (start == end)
 360     return scm_i_make_string (0, NULL, 1);
 361   else
 362     {
 363       SCM buf;
 364       size_t str_start;
 365       get_str_buf_start (&str, &buf, &str_start);
 366       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 367       SET_STRINGBUF_SHARED (buf);
 368       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 369       return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
 370                               (scm_t_bits)str_start + start,
 371                               (scm_t_bits) end - start);
 372     }
 373 }
 374
 375 SCM
 376 scm_i_substring_copy (SCM str, size_t start, size_t end)
 377 {
 378   if (start == end)
 379     return scm_i_make_string (0, NULL, 0);
 380   else
 381     {
 382       size_t len = end - start;
 383       SCM buf, my_buf, substr;
 384       size_t str_start;
 385       int wide = 0;
 386       get_str_buf_start (&str, &buf, &str_start);
 387       if (scm_i_is_narrow_string (str))
 388         {
 389           my_buf = make_stringbuf (len);
 390           memcpy (STRINGBUF_CHARS (my_buf),
 391                   STRINGBUF_CHARS (buf) + str_start + start, len);
 392         }
 393       else
 394         {
 395           my_buf = make_wide_stringbuf (len);
 396           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
 397                    (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
 398                                      + start), len);
 399           wide = 1;
 400         }
 401       scm_remember_upto_here_1 (buf);
 402       substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
 403                                 (scm_t_bits) 0, (scm_t_bits) len);
 404       if (wide)
 405         scm_i_try_narrow_string (substr);
 406       return substr;
 407     }
 408 }
 409
 410 SCM
 411 scm_i_substring_shared (SCM str, size_t start, size_t end)
 412 {
 413   if (start == 0 && end == STRING_LENGTH (str))
 414     return str;
 415   else if (start == end)
 416     return scm_i_make_string (0, NULL, 0);
 417   else
 418     {
 419       size_t len = end - start;
 420       if (IS_SH_STRING (str))
 421         {
 422           start += STRING_START (str);
 423           str = SH_STRING_STRING (str);
 424         }
 425       return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
 426                               (scm_t_bits)start, (scm_t_bits) len);
 427     }
 428 }
 429
 430 SCM
 431 scm_c_substring (SCM str, size_t start, size_t end)
 432 {
 433   validate_substring_args (str, start, end);
 434   return scm_i_substring (str, start, end);
 435 }
 436
 437 SCM
 438 scm_c_substring_read_only (SCM str, size_t start, size_t end)
 439 {
 440   validate_substring_args (str, start, end);
 441   return scm_i_substring_read_only (str, start, end);
 442 }
 443
 444 SCM
 445 scm_c_substring_copy (SCM str, size_t start, size_t end)
 446 {
 447   validate_substring_args (str, start, end);
 448   return scm_i_substring_copy (str, start, end);
 449 }
 450
 451 SCM
 452 scm_c_substring_shared (SCM str, size_t start, size_t end)
 453 {
 454   validate_substring_args (str, start, end);
 455   return scm_i_substring_shared (str, start, end);
 456 }
 457
 458 \f
 459 /* Internal accessors
 460  */
 461
 462 /* Returns the number of characters in STR.  This may be different
 463    than the memory size of the string storage.  */
 464 size_t
 465 scm_i_string_length (SCM str)
 466 {
 467   return STRING_LENGTH (str);
 468 }
 469
 470 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
 471    encoding.  False if it is 'wide', having a 32-bit UCS-4
 472    encoding.  */
 473 int
 474 scm_i_is_narrow_string (SCM str)
 475 {
 476   if (IS_SH_STRING (str))
 477     str = SH_STRING_STRING (str);
 478
 479   return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
 480 }
 481
 482 /* Try to coerce a string to be narrow.  It if is narrow already, do
 483    nothing.  If it is wide, shrink it to narrow if none of its
 484    characters are above 0xFF.  Return true if the string is narrow or
 485    was made to be narrow.  */
 486 int
 487 scm_i_try_narrow_string (SCM str)
 488 {
 489   if (IS_SH_STRING (str))
 490     str = SH_STRING_STRING (str);
 491
 492   SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
 493
 494   return scm_i_is_narrow_string (str);
 495 }
 496
 497 /* Return a pointer to the raw data of the string, which can be either Latin-1
 498    or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'.  */
 499 const void *
 500 scm_i_string_data (SCM str)
 501 {
 502   SCM buf;
 503   size_t start;
 504   const char *data;
 505
 506   get_str_buf_start (&str, &buf, &start);
 507
 508   data = STRINGBUF_CONTENTS (buf);
 509   data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
 510
 511   return data;
 512 }
 513
 514 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
 515    STR.  */
 516 const char *
 517 scm_i_string_chars (SCM str)
 518 {
 519   SCM buf;
 520   size_t start;
 521   get_str_buf_start (&str, &buf, &start);
 522   if (scm_i_is_narrow_string (str))
 523     return (const char *) STRINGBUF_CHARS (buf) + start;
 524   else
 525     scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
 526                     scm_list_1 (str));
 527   return NULL;
 528 }
 529
 530 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
 531    STR.  */
 532 const scm_t_wchar *
 533 scm_i_string_wide_chars (SCM str)
 534 {
 535   SCM buf;
 536   size_t start;
 537
 538   get_str_buf_start (&str, &buf, &start);
 539   if (!scm_i_is_narrow_string (str))
 540     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
 541   else
 542     scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
 543                     scm_list_1 (str));
 544 }
 545
 546 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
 547    a new string buffer, so that it can be modified without modifying
 548    other strings.  Also, lock the string mutex.  Later, one must call
 549    scm_i_string_stop_writing to unlock the mutex.  */
 550 SCM
 551 scm_i_string_start_writing (SCM orig_str)
 552 {
 553   SCM buf, str = orig_str;
 554   size_t start;
 555
 556   get_str_buf_start (&str, &buf, &start);
 557   if (IS_RO_STRING (str))
 558     scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
 559
 560   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 561   if (STRINGBUF_SHARED (buf))
 562     {
 563       /* Clone the stringbuf.  */
 564       size_t len = STRING_LENGTH (str);
 565       SCM new_buf;
 566
 567       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 568
 569       if (scm_i_is_narrow_string (str))
 570         {
 571           new_buf = make_stringbuf (len);
 572           memcpy (STRINGBUF_CHARS (new_buf),
 573                   STRINGBUF_CHARS (buf) + STRING_START (str), len);
 574
 575         }
 576       else
 577         {
 578           new_buf = make_wide_stringbuf (len);
 579           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 580                    (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
 581                                      + STRING_START (str)), len);
 582         }
 583
 584       SET_STRING_STRINGBUF (str, new_buf);
 585       start -= STRING_START (str);
 586
 587       /* FIXME: The following operations are not atomic, so other threads
 588          looking at STR may see an inconsistent state.  Nevertheless it can't
 589          hurt much since (i) accessing STR while it is being mutated can't
 590          yield a crash, and (ii) concurrent accesses to STR should be
 591          protected by a mutex at the application level.  The latter may not
 592          apply when STR != ORIG_STR, though.  */
 593       SET_STRING_START (str, 0);
 594       SET_STRING_STRINGBUF (str, new_buf);
 595
 596       buf = new_buf;
 597
 598       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 599     }
 600   return orig_str;
 601 }
 602
 603 /* Return a pointer to the 8-bit Latin-1 chars of a string.  */
 604 char *
 605 scm_i_string_writable_chars (SCM str)
 606 {
 607   SCM buf;
 608   size_t start;
 609
 610   get_str_buf_start (&str, &buf, &start);
 611   if (scm_i_is_narrow_string (str))
 612     return (char *) STRINGBUF_CHARS (buf) + start;
 613   else
 614     scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
 615                     scm_list_1 (str));
 616   return NULL;
 617 }
 618
 619 /* Return a pointer to the UCS-4 codepoints of a string.  */
 620 static scm_t_wchar *
 621 scm_i_string_writable_wide_chars (SCM str)
 622 {
 623   SCM buf;
 624   size_t start;
 625
 626   get_str_buf_start (&str, &buf, &start);
 627   if (!scm_i_is_narrow_string (str))
 628     return STRINGBUF_WIDE_CHARS (buf) + start;
 629   else
 630     scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
 631                     scm_list_1 (str));
 632 }
 633
 634 /* Unlock the string mutex that was locked when
 635    scm_i_string_start_writing was called.  */
 636 void
 637 scm_i_string_stop_writing (void)
 638 {
 639   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 640 }
 641
 642 /* Return the Xth character of STR as a UCS-4 codepoint.  */
 643 scm_t_wchar
 644 scm_i_string_ref (SCM str, size_t x)
 645 {
 646   if (scm_i_is_narrow_string (str))
 647     return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
 648   else
 649     return scm_i_string_wide_chars (str)[x];
 650 }
 651
 652 /* Returns index+1 of the first char in STR that matches C, or
 653    0 if the char is not found.  */
 654 int
 655 scm_i_string_contains_char (SCM str, char ch)
 656 {
 657   size_t i;
 658   size_t len = scm_i_string_length (str);
 659
 660   i = 0;
 661   if (scm_i_is_narrow_string (str))
 662     {
 663       while (i < len)
 664         {
 665           if (scm_i_string_chars (str)[i] == ch)
 666             return i+1;
 667           i++;
 668         }
 669     }
 670   else
 671     {
 672       while (i < len)
 673         {
 674           if (scm_i_string_wide_chars (str)[i]
 675               == (unsigned char) ch)
 676             return i+1;
 677           i++;
 678         }
 679     }
 680   return 0;
 681 }
 682
 683 int
 684 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
 685 {
 686   if (scm_i_is_narrow_string (sstr))
 687     {
 688       const char *a = scm_i_string_chars (sstr) + start_x;
 689       const char *b = cstr;
 690       return strncmp (a, b, strlen(b));
 691     }
 692   else
 693     {
 694       size_t i;
 695       const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
 696       const char *b = cstr;
 697       for (i = 0; i < strlen (b); i++)
 698         {
 699           if (a[i] != (unsigned char) b[i])
 700             return 1;
 701         }
 702     }
 703   return 0;
 704 }
 705
 706 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
 707 void
 708 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
 709 {
 710   if (IS_SH_STRING (str))
 711     {
 712       p += STRING_START (str);
 713       str = SH_STRING_STRING (str);
 714     }
 715
 716   if (chr > 0xFF && scm_i_is_narrow_string (str))
 717     SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
 718
 719   if (scm_i_is_narrow_string (str))
 720     {
 721       char *dst = scm_i_string_writable_chars (str);
 722       dst[p] = chr;
 723     }
 724   else
 725     {
 726       scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
 727       dst[p] = chr;
 728     }
 729 }
 730
 731 \f
 732 /* Symbols.
 733
 734    Basic symbol creation and accessing is done here, the rest is in
 735    symbols.[hc].  This has been done to keep stringbufs and the
 736    internals of strings and string-like objects confined to this file.
 737 */
 738
 739 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
 740
 741 SCM
 742 scm_i_make_symbol (SCM name, scm_t_bits flags,
 743                    unsigned long hash, SCM props)
 744 {
 745   SCM buf;
 746   size_t start = STRING_START (name);
 747   size_t length = STRING_LENGTH (name);
 748
 749   if (IS_SH_STRING (name))
 750     {
 751       name = SH_STRING_STRING (name);
 752       start += STRING_START (name);
 753     }
 754   buf = STRING_STRINGBUF (name);
 755
 756   if (start == 0 && length == STRINGBUF_LENGTH (buf))
 757     {
 758       /* reuse buf. */
 759       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 760       SET_STRINGBUF_SHARED (buf);
 761       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 762     }
 763   else
 764     {
 765       /* make new buf. */
 766       if (scm_i_is_narrow_string (name))
 767         {
 768           SCM new_buf = make_stringbuf (length);
 769           memcpy (STRINGBUF_CHARS (new_buf),
 770                   STRINGBUF_CHARS (buf) + start, length);
 771           buf = new_buf;
 772         }
 773       else
 774         {
 775           SCM new_buf = make_wide_stringbuf (length);
 776           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 777                    (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
 778                    length);
 779           buf = new_buf;
 780         }
 781     }
 782   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 783                           (scm_t_bits) hash, SCM_UNPACK (props));
 784 }
 785
 786 SCM
 787 scm_i_c_make_symbol (const char *name, size_t len,
 788                      scm_t_bits flags, unsigned long hash, SCM props)
 789 {
 790   SCM buf = make_stringbuf (len);
 791   memcpy (STRINGBUF_CHARS (buf), name, len);
 792
 793   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 794                           (scm_t_bits) hash, SCM_UNPACK (props));
 795 }
 796
 797 /* Returns the number of characters in SYM.  This may be different
 798    from the memory size of SYM.  */
 799 size_t
 800 scm_i_symbol_length (SCM sym)
 801 {
 802   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 803 }
 804
 805 size_t
 806 scm_c_symbol_length (SCM sym)
 807 #define FUNC_NAME "scm_c_symbol_length"
 808 {
 809   SCM_VALIDATE_SYMBOL (1, sym);
 810
 811   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 812 }
 813 #undef FUNC_NAME
 814
 815 /* True if the name of SYM is stored as a Latin-1 encoded string.
 816    False if it is stored as a 32-bit UCS-4-encoded string.  */
 817 int
 818 scm_i_is_narrow_symbol (SCM sym)
 819 {
 820   SCM buf;
 821
 822   buf = SYMBOL_STRINGBUF (sym);
 823   return !STRINGBUF_WIDE (buf);
 824 }
 825
 826 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
 827    contains the name of SYM.  */
 828 const char *
 829 scm_i_symbol_chars (SCM sym)
 830 {
 831   SCM buf;
 832
 833   buf = SYMBOL_STRINGBUF (sym);
 834   if (!STRINGBUF_WIDE (buf))
 835     return (const char *) STRINGBUF_CHARS (buf);
 836   else
 837     scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
 838                     scm_list_1 (sym));
 839 }
 840
 841 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
 842    symbol's name.  */
 843 const scm_t_wchar *
 844 scm_i_symbol_wide_chars (SCM sym)
 845 {
 846   SCM buf;
 847
 848   buf = SYMBOL_STRINGBUF (sym);
 849   if (STRINGBUF_WIDE (buf))
 850     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
 851   else
 852     scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
 853                     scm_list_1 (sym));
 854 }
 855
 856 SCM
 857 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
 858 {
 859   SCM buf = SYMBOL_STRINGBUF (sym);
 860   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 861   SET_STRINGBUF_SHARED (buf);
 862   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 863   return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
 864                           (scm_t_bits)start, (scm_t_bits) end - start);
 865 }
 866
 867 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint.  */
 868 scm_t_wchar
 869 scm_i_symbol_ref (SCM sym, size_t x)
 870 {
 871   if (scm_i_is_narrow_symbol (sym))
 872     return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
 873   else
 874     return scm_i_symbol_wide_chars (sym)[x];
 875 }
 876
 877 /* Debugging
 878  */
 879
 880 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
 881             "Returns an association list containing debugging information\n"
 882             "for @var{str}. The association list has the following entries."
 883             "@table @code\n"
 884             "@item string\n"
 885             "The string itself.\n"
 886             "@item start\n"
 887             "The start index of the string into its stringbuf\n"
 888             "@item length\n"
 889             "The length of the string\n"
 890             "@item shared\n"
 891             "If this string is a substring, it returns its parent string.\n"
 892             "Otherwise, it returns @code{#f}\n"
 893             "@item read-only\n"
 894             "@code{#t} if the string is read-only\n"
 895             "@item stringbuf-chars\n"
 896             "A new string containing this string's stringbuf's characters\n"
 897             "@item stringbuf-length\n"
 898             "The number of characters in this stringbuf\n"
 899             "@item stringbuf-shared\n"
 900             "@code{#t} if this stringbuf is shared\n"
 901             "@item stringbuf-wide\n"
 902             "@code{#t} if this stringbuf's characters are stored in a\n"
 903             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
 904             "buffer\n"
 905             "@end table")
 906 #define FUNC_NAME s_scm_sys_string_dump
 907 {
 908   SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
 909   SCM buf;
 910   SCM_VALIDATE_STRING (1, str);
 911
 912   /* String info */
 913   e1 = scm_cons (scm_from_latin1_symbol ("string"),
 914                  str);
 915   e2 = scm_cons (scm_from_latin1_symbol ("start"),
 916                  scm_from_size_t (STRING_START (str)));
 917   e3 = scm_cons (scm_from_latin1_symbol ("length"),
 918                  scm_from_size_t (STRING_LENGTH (str)));
 919
 920   if (IS_SH_STRING (str))
 921     {
 922       e4 = scm_cons (scm_from_latin1_symbol ("shared"),
 923                      SH_STRING_STRING (str));
 924       buf = STRING_STRINGBUF (SH_STRING_STRING (str));
 925     }
 926   else
 927     {
 928       e4 = scm_cons (scm_from_latin1_symbol ("shared"),
 929                      SCM_BOOL_F);
 930       buf = STRING_STRINGBUF (str);
 931     }
 932
 933   if (IS_RO_STRING (str))
 934     e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
 935                    SCM_BOOL_T);
 936   else
 937     e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
 938                    SCM_BOOL_F);
 939
 940   /* Stringbuf info */
 941   if (!STRINGBUF_WIDE (buf))
 942     {
 943       size_t len = STRINGBUF_LENGTH (buf);
 944       char *cbuf;
 945       SCM sbc = scm_i_make_string (len, &cbuf, 0);
 946       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
 947       e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 948                      sbc);
 949     }
 950   else
 951     {
 952       size_t len = STRINGBUF_LENGTH (buf);
 953       scm_t_wchar *cbuf;
 954       SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
 955       u32_cpy ((scm_t_uint32 *) cbuf,
 956                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
 957       e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 958                      sbc);
 959     }
 960   e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
 961                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
 962   if (STRINGBUF_SHARED (buf))
 963     e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 964                    SCM_BOOL_T);
 965   else
 966     e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 967                    SCM_BOOL_F);
 968   if (STRINGBUF_WIDE (buf))
 969     e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 970                    SCM_BOOL_T);
 971   else
 972     e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 973                    SCM_BOOL_F);
 974
 975   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
 976 }
 977 #undef FUNC_NAME
 978
 979 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
 980             "Returns an association list containing debugging information\n"
 981             "for @var{sym}. The association list has the following entries."
 982             "@table @code\n"
 983             "@item symbol\n"
 984             "The symbol itself\n"
 985             "@item hash\n"
 986             "Its hash value\n"
 987             "@item interned\n"
 988             "@code{#t} if it is an interned symbol\n"
 989             "@item stringbuf-chars\n"
 990             "A new string containing this symbols's stringbuf's characters\n"
 991             "@item stringbuf-length\n"
 992             "The number of characters in this stringbuf\n"
 993             "@item stringbuf-shared\n"
 994             "@code{#t} if this stringbuf is shared\n"
 995             "@item stringbuf-wide\n"
 996             "@code{#t} if this stringbuf's characters are stored in a\n"
 997             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
 998             "buffer\n"
 999             "@end table")
1000 #define FUNC_NAME s_scm_sys_symbol_dump
1001 {
1002   SCM e1, e2, e3, e4, e5, e6, e7;
1003   SCM buf;
1004   SCM_VALIDATE_SYMBOL (1, sym);
1005   e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
1006                  sym);
1007   e2 = scm_cons (scm_from_latin1_symbol ("hash"),
1008                  scm_from_ulong (scm_i_symbol_hash (sym)));
1009   e3 = scm_cons (scm_from_latin1_symbol ("interned"),
1010                  scm_symbol_interned_p (sym));
1011   buf = SYMBOL_STRINGBUF (sym);
1012
1013   /* Stringbuf info */
1014   if (!STRINGBUF_WIDE (buf))
1015     {
1016       size_t len = STRINGBUF_LENGTH (buf);
1017       char *cbuf;
1018       SCM sbc = scm_i_make_string (len, &cbuf, 0);
1019       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
1020       e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1021                      sbc);
1022     }
1023   else
1024     {
1025       size_t len = STRINGBUF_LENGTH (buf);
1026       scm_t_wchar *cbuf;
1027       SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
1028       u32_cpy ((scm_t_uint32 *) cbuf,
1029                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
1030       e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1031                      sbc);
1032     }
1033   e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
1034                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
1035   if (STRINGBUF_SHARED (buf))
1036     e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1037                    SCM_BOOL_T);
1038   else
1039     e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1040                    SCM_BOOL_F);
1041   if (STRINGBUF_WIDE (buf))
1042     e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1043                     SCM_BOOL_T);
1044   else
1045     e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1046                     SCM_BOOL_F);
1047   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
1048
1049 }
1050 #undef FUNC_NAME
1051
1052 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1053
1054 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
1055 #define FUNC_NAME s_scm_sys_stringbuf_hist
1056 {
1057   int i;
1058   for (i = 0; i < 1000; i++)
1059     if (lenhist[i])
1060       fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1061   fprintf (stderr, ">999: %u\n", lenhist[1000]);
1062   return SCM_UNSPECIFIED;
1063 }
1064 #undef FUNC_NAME
1065
1066 #endif
1067
1068 \f
1069
1070 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1071             (SCM obj),
1072             "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1073 #define FUNC_NAME s_scm_string_p
1074 {
1075   return scm_from_bool (IS_STRING (obj));
1076 }
1077 #undef FUNC_NAME
1078
1079
1080 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1081
1082 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1083             (SCM chrs),
1084             "@deffnx {Scheme Procedure} list->string chrs\n"
1085             "Return a newly allocated string composed of the arguments,\n"
1086             "@var{chrs}.")
1087 #define FUNC_NAME s_scm_string
1088 {
1089   SCM result = SCM_BOOL_F;
1090   SCM rest;
1091   size_t len;
1092   size_t p = 0;
1093   long i;
1094   int wide = 0;
1095
1096   /* Verify that this is a list of chars.  */
1097   i = scm_ilength (chrs);
1098   SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1099
1100   len = (size_t) i;
1101   rest = chrs;
1102
1103   while (len > 0 && scm_is_pair (rest))
1104     {
1105       SCM elt = SCM_CAR (rest);
1106       SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1107       if (SCM_CHAR (elt) > 0xFF)
1108         wide = 1;
1109       rest = SCM_CDR (rest);
1110       len--;
1111       scm_remember_upto_here_1 (elt);
1112     }
1113
1114   /* Construct a string containing this list of chars.  */
1115   len = (size_t) i;
1116   rest = chrs;
1117
1118   if (wide == 0)
1119     {
1120       char *buf;
1121
1122       result = scm_i_make_string (len, NULL, 0);
1123       result = scm_i_string_start_writing (result);
1124       buf = scm_i_string_writable_chars (result);
1125       while (len > 0 && scm_is_pair (rest))
1126         {
1127           SCM elt = SCM_CAR (rest);
1128           buf[p] = (unsigned char) SCM_CHAR (elt);
1129           p++;
1130           rest = SCM_CDR (rest);
1131           len--;
1132           scm_remember_upto_here_1 (elt);
1133         }
1134     }
1135   else
1136     {
1137       scm_t_wchar *buf;
1138
1139       result = scm_i_make_wide_string (len, NULL, 0);
1140       result = scm_i_string_start_writing (result);
1141       buf = scm_i_string_writable_wide_chars (result);
1142       while (len > 0 && scm_is_pair (rest))
1143         {
1144           SCM elt = SCM_CAR (rest);
1145           buf[p] = SCM_CHAR (elt);
1146           p++;
1147           rest = SCM_CDR (rest);
1148           len--;
1149           scm_remember_upto_here_1 (elt);
1150         }
1151     }
1152   scm_i_string_stop_writing ();
1153
1154   if (len > 0)
1155     scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1156   if (!scm_is_null (rest))
1157     scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1158
1159   return result;
1160 }
1161 #undef FUNC_NAME
1162
1163 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1164             (SCM k, SCM chr),
1165             "Return a newly allocated string of\n"
1166             "length @var{k}.  If @var{chr} is given, then all elements of\n"
1167             "the string are initialized to @var{chr}, otherwise the contents\n"
1168             "of the string are all set to @code{#\nul}.")
1169 #define FUNC_NAME s_scm_make_string
1170 {
1171   return scm_c_make_string (scm_to_size_t (k), chr);
1172 }
1173 #undef FUNC_NAME
1174
1175 SCM
1176 scm_c_make_string (size_t len, SCM chr)
1177 #define FUNC_NAME NULL
1178 {
1179   size_t p;
1180   char *contents = NULL;
1181   SCM res = scm_i_make_string (len, &contents, 0);
1182
1183   /* If no char is given, initialize string contents to NULL.  */
1184   if (SCM_UNBNDP (chr))
1185     memset (contents, 0, len);
1186   else
1187     {
1188       SCM_VALIDATE_CHAR (0, chr);
1189       res = scm_i_string_start_writing (res);
1190       for (p = 0; p < len; p++)
1191         scm_i_string_set_x (res, p, SCM_CHAR (chr));
1192       scm_i_string_stop_writing ();
1193     }
1194
1195   return res;
1196 }
1197 #undef FUNC_NAME
1198
1199 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1200             (SCM string),
1201             "Return the number of characters in @var{string}.")
1202 #define FUNC_NAME s_scm_string_length
1203 {
1204   SCM_VALIDATE_STRING (1, string);
1205   return scm_from_size_t (STRING_LENGTH (string));
1206 }
1207 #undef FUNC_NAME
1208
1209 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1210             (SCM string),
1211             "Return the bytes used to represent a character in @var{string}."
1212             "This will return 1 or 4.")
1213 #define FUNC_NAME s_scm_string_bytes_per_char
1214 {
1215   SCM_VALIDATE_STRING (1, string);
1216   if (!scm_i_is_narrow_string (string))
1217     return scm_from_int (4);
1218
1219   return scm_from_int (1);
1220 }
1221 #undef FUNC_NAME
1222
1223 size_t
1224 scm_c_string_length (SCM string)
1225 {
1226   if (!IS_STRING (string))
1227     scm_wrong_type_arg_msg (NULL, 0, string, "string");
1228   return STRING_LENGTH (string);
1229 }
1230
1231 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1232             (SCM str, SCM k),
1233             "Return character @var{k} of @var{str} using zero-origin\n"
1234             "indexing. @var{k} must be a valid index of @var{str}.")
1235 #define FUNC_NAME s_scm_string_ref
1236 {
1237   size_t len;
1238   unsigned long idx;
1239
1240   SCM_VALIDATE_STRING (1, str);
1241
1242   len = scm_i_string_length (str);
1243   if (SCM_LIKELY (len > 0))
1244     idx = scm_to_unsigned_integer (k, 0, len - 1);
1245   else
1246     scm_out_of_range (NULL, k);
1247
1248   if (scm_i_is_narrow_string (str))
1249     return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1250   else
1251     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1252 }
1253 #undef FUNC_NAME
1254
1255 SCM
1256 scm_c_string_ref (SCM str, size_t p)
1257 {
1258   if (p >= scm_i_string_length (str))
1259     scm_out_of_range (NULL, scm_from_size_t (p));
1260   if (scm_i_is_narrow_string (str))
1261     return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1262   else
1263     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1264
1265 }
1266
1267 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1268             (SCM str, SCM k, SCM chr),
1269             "Store @var{chr} in element @var{k} of @var{str} and return\n"
1270             "an unspecified value. @var{k} must be a valid index of\n"
1271             "@var{str}.")
1272 #define FUNC_NAME s_scm_string_set_x
1273 {
1274   size_t len;
1275   unsigned long idx;
1276
1277   SCM_VALIDATE_STRING (1, str);
1278
1279   len = scm_i_string_length (str);
1280   if (SCM_LIKELY (len > 0))
1281     idx = scm_to_unsigned_integer (k, 0, len - 1);
1282   else
1283     scm_out_of_range (NULL, k);
1284
1285   SCM_VALIDATE_CHAR (3, chr);
1286   str = scm_i_string_start_writing (str);
1287   scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1288   scm_i_string_stop_writing ();
1289
1290   return SCM_UNSPECIFIED;
1291 }
1292 #undef FUNC_NAME
1293
1294 void
1295 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1296 {
1297   if (p >= scm_i_string_length (str))
1298     scm_out_of_range (NULL, scm_from_size_t (p));
1299   str = scm_i_string_start_writing (str);
1300   scm_i_string_set_x (str, p, SCM_CHAR (chr));
1301   scm_i_string_stop_writing ();
1302 }
1303
1304 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1305             (SCM str, SCM start, SCM end),
1306             "Return a newly allocated string formed from the characters\n"
1307             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1308             "ending with index @var{end} (exclusive).\n"
1309             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1310             "exact integers satisfying:\n\n"
1311             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1312 #define FUNC_NAME s_scm_substring
1313 {
1314   size_t len, from, to;
1315
1316   SCM_VALIDATE_STRING (1, str);
1317   len = scm_i_string_length (str);
1318   from = scm_to_unsigned_integer (start, 0, len);
1319   if (SCM_UNBNDP (end))
1320     to = len;
1321   else
1322     to = scm_to_unsigned_integer (end, from, len);
1323   return scm_i_substring (str, from, to);
1324 }
1325 #undef FUNC_NAME
1326
1327 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1328             (SCM str, SCM start, SCM end),
1329             "Return a newly allocated string formed from the characters\n"
1330             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1331             "ending with index @var{end} (exclusive).\n"
1332             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1333             "exact integers satisfying:\n"
1334             "\n"
1335             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1336             "\n"
1337             "The returned string is read-only.\n")
1338 #define FUNC_NAME s_scm_substring_read_only
1339 {
1340   size_t len, from, to;
1341
1342   SCM_VALIDATE_STRING (1, str);
1343   len = scm_i_string_length (str);
1344   from = scm_to_unsigned_integer (start, 0, len);
1345   if (SCM_UNBNDP (end))
1346     to = len;
1347   else
1348     to = scm_to_unsigned_integer (end, from, len);
1349   return scm_i_substring_read_only (str, from, to);
1350 }
1351 #undef FUNC_NAME
1352
1353 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1354             (SCM str, SCM start, SCM end),
1355             "Return a newly allocated string formed from the characters\n"
1356             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1357             "ending with index @var{end} (exclusive).\n"
1358             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1359             "exact integers satisfying:\n\n"
1360             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1361 #define FUNC_NAME s_scm_substring_copy
1362 {
1363   /* For the Scheme version, START is mandatory, but for the C
1364      version, it is optional.  See scm_string_copy in srfi-13.c for a
1365      rationale.
1366   */
1367
1368   size_t from, to;
1369
1370   SCM_VALIDATE_STRING (1, str);
1371   scm_i_get_substring_spec (scm_i_string_length (str),
1372                             start, &from, end, &to);
1373   return scm_i_substring_copy (str, from, to);
1374 }
1375 #undef FUNC_NAME
1376
1377 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1378             (SCM str, SCM start, SCM end),
1379             "Return string that indirectly refers to the characters\n"
1380             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1381             "ending with index @var{end} (exclusive).\n"
1382             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1383             "exact integers satisfying:\n\n"
1384             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1385 #define FUNC_NAME s_scm_substring_shared
1386 {
1387   size_t len, from, to;
1388
1389   SCM_VALIDATE_STRING (1, str);
1390   len = scm_i_string_length (str);
1391   from = scm_to_unsigned_integer (start, 0, len);
1392   if (SCM_UNBNDP (end))
1393     to = len;
1394   else
1395     to = scm_to_unsigned_integer (end, from, len);
1396   return scm_i_substring_shared (str, from, to);
1397 }
1398 #undef FUNC_NAME
1399
1400 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1401             (SCM args),
1402             "Return a newly allocated string whose characters form the\n"
1403             "concatenation of the given strings, @var{args}.")
1404 #define FUNC_NAME s_scm_string_append
1405 {
1406   SCM res;
1407   size_t total = 0;
1408   size_t len;
1409   int wide = 0;
1410   SCM l, s;
1411   size_t i;
1412   union
1413   {
1414     char *narrow;
1415     scm_t_wchar *wide;
1416   } data;
1417
1418   SCM_VALIDATE_REST_ARGUMENT (args);
1419   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1420     {
1421       s = SCM_CAR (l);
1422       SCM_VALIDATE_STRING (SCM_ARGn, s);
1423       len = scm_i_string_length (s);
1424       if (((size_t) -1) - total < len)
1425         scm_num_overflow (s_scm_string_append);
1426       total += len;
1427       if (!scm_i_is_narrow_string (s))
1428         wide = 1;
1429     }
1430   data.narrow = NULL;
1431   if (!wide)
1432     res = scm_i_make_string (total, &data.narrow, 0);
1433   else
1434     res = scm_i_make_wide_string (total, &data.wide, 0);
1435
1436   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1437     {
1438       size_t len;
1439       s = SCM_CAR (l);
1440       SCM_VALIDATE_STRING (SCM_ARGn, s);
1441       len = scm_i_string_length (s);
1442       if (len > total)
1443         SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
1444       if (!wide)
1445         {
1446           memcpy (data.narrow, scm_i_string_chars (s), len);
1447           data.narrow += len;
1448         }
1449       else
1450         {
1451           if (scm_i_is_narrow_string (s))
1452             {
1453               const char *src = scm_i_string_chars (s);
1454               for (i = 0; i < len; i++)
1455                 data.wide[i] = (unsigned char) src[i];
1456             }
1457           else
1458             u32_cpy ((scm_t_uint32 *) data.wide,
1459                      (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1460           data.wide += len;
1461         }
1462       total -= len;
1463       scm_remember_upto_here_1 (s);
1464     }
1465   if (total != 0)
1466     SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
1467   return res;
1468 }
1469 #undef FUNC_NAME
1470
1471
1472 \f
1473 /* Charset conversion error handling.  */
1474
1475 SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1476 SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1477
1478 /* Raise an exception informing that character CHR could not be written
1479    to PORT in its current encoding.  */
1480 void
1481 scm_encoding_error (const char *subr, int err, const char *message,
1482                     SCM port, SCM chr)
1483 {
1484   scm_throw (scm_encoding_error_key,
1485              scm_list_n (scm_from_latin1_string (subr),
1486                          scm_from_latin1_string (message),
1487                          scm_from_int (err),
1488                          port, chr,
1489                          SCM_UNDEFINED));
1490 }
1491
1492 /* Raise an exception informing of an encoding error on PORT.  This
1493    means that a character could not be written in PORT's encoding.  */
1494 void
1495 scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1496 {
1497   scm_throw (scm_decoding_error_key,
1498              scm_list_n (scm_from_latin1_string (subr),
1499                          scm_from_latin1_string (message),
1500                          scm_from_int (err),
1501                          port,
1502                          SCM_UNDEFINED));
1503 }
1504
1505 \f
1506 /* String conversion to/from C.  */
1507
1508 static void
1509 decoding_error (const char *func_name, int errno_save,
1510                 const char *str, size_t len)
1511 {
1512   /* Raise an error and pass the raw C string as a bytevector to the `throw'
1513      handler.  */
1514   SCM bv;
1515   signed char *buf;
1516
1517   buf = scm_gc_malloc_pointerless (len, "bytevector");
1518   memcpy (buf, str, len);
1519   bv = scm_c_take_gc_bytevector (buf, len, SCM_BOOL_F);
1520
1521   scm_decoding_error (func_name, errno_save,
1522                       "input locale conversion error", bv);
1523 }
1524
1525 SCM
1526 scm_from_stringn (const char *str, size_t len, const char *encoding,
1527                   scm_t_string_failed_conversion_handler handler)
1528 {
1529   size_t u32len, i;
1530   scm_t_wchar *u32;
1531   int wide = 0;
1532   SCM res;
1533
1534   /* The order of these checks is important. */
1535   if (!str && len != 0)
1536     scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1537   if (len == (size_t) -1)
1538     len = strlen (str);
1539
1540   if (c_strcasecmp (encoding, "ISO-8859-1") == 0 || len == 0)
1541     return scm_from_latin1_stringn (str, len);
1542   else if (c_strcasecmp (encoding, "UTF-8") == 0
1543            && handler == SCM_FAILED_CONVERSION_ERROR)
1544     return scm_from_utf8_stringn (str, len);
1545
1546   u32len = 0;
1547   u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1548                                                 (enum iconv_ilseq_handler)
1549                                                 handler,
1550                                                 str, len,
1551                                                 NULL,
1552                                                 NULL, &u32len);
1553
1554   if (SCM_UNLIKELY (u32 == NULL))
1555     decoding_error (__func__, errno, str, len);
1556
1557   i = 0;
1558   while (i < u32len)
1559     if (u32[i++] > 0xFF)
1560       {
1561         wide = 1;
1562         break;
1563       }
1564
1565   if (!wide)
1566     {
1567       char *dst;
1568       res = scm_i_make_string (u32len, &dst, 0);
1569       for (i = 0; i < u32len; i ++)
1570         dst[i] = (unsigned char) u32[i];
1571       dst[u32len] = '\0';
1572     }
1573   else
1574     {
1575       scm_t_wchar *wdst;
1576       res = scm_i_make_wide_string (u32len, &wdst, 0);
1577       u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1578       wdst[u32len] = 0;
1579     }
1580
1581   free (u32);
1582   return res;
1583 }
1584
1585 SCM
1586 scm_from_locale_string (const char *str)
1587 {
1588   return scm_from_locale_stringn (str, -1);
1589 }
1590
1591 SCM
1592 scm_from_locale_stringn (const char *str, size_t len)
1593 {
1594   return scm_from_stringn (str, len, locale_charset (),
1595                            scm_i_default_port_conversion_handler ());
1596 }
1597
1598 SCM
1599 scm_from_latin1_string (const char *str)
1600 {
1601   return scm_from_latin1_stringn (str, -1);
1602 }
1603
1604 SCM
1605 scm_from_latin1_stringn (const char *str, size_t len)
1606 {
1607   char *buf;
1608   SCM result;
1609
1610   if (len == (size_t) -1)
1611     len = strlen (str);
1612
1613   /* Make a narrow string and copy STR as is.  */
1614   result = scm_i_make_string (len, &buf, 0);
1615   memcpy (buf, str, len);
1616
1617   return result;
1618 }
1619
1620 SCM
1621 scm_from_utf8_string (const char *str)
1622 {
1623   return scm_from_utf8_stringn (str, -1);
1624 }
1625
1626 SCM
1627 scm_from_utf8_stringn (const char *str, size_t len)
1628 {
1629   size_t i, char_len;
1630   const scm_t_uint8 *ustr = (const scm_t_uint8 *) str;
1631   int ascii = 1, narrow = 1;
1632   SCM res;
1633
1634   if (len == (size_t) -1)
1635     len = strlen (str);
1636
1637   i = 0;
1638   char_len = 0;
1639
1640   while (i < len)
1641     {
1642       if (ustr[i] <= 127)
1643         {
1644           char_len++;
1645           i++;
1646         }
1647       else
1648         {
1649           ucs4_t c;
1650           int nbytes;
1651
1652           ascii = 0;
1653
1654           nbytes = u8_mbtouc (&c, ustr + i, len - i);
1655
1656           if (c == 0xfffd)
1657             /* Bad UTF-8.  */
1658             decoding_error (__func__, errno, str, len);
1659
1660           if (c > 255)
1661             narrow = 0;
1662
1663           char_len++;
1664           i += nbytes;
1665         }
1666     }
1667
1668   if (ascii)
1669     {
1670       char *dst;
1671       res = scm_i_make_string (char_len, &dst, 0);
1672       memcpy (dst, str, len);
1673     }
1674   else if (narrow)
1675     {
1676       char *dst;
1677       size_t j;
1678       ucs4_t c;
1679
1680       res = scm_i_make_string (char_len, &dst, 0);
1681
1682       for (i = 0, j = 0; i < len; j++)
1683         {
1684           i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1685           dst[j] = (signed char) c;
1686         }
1687     }
1688   else
1689     {
1690       scm_t_wchar *dst;
1691       size_t j;
1692       ucs4_t c;
1693
1694       res = scm_i_make_wide_string (char_len, &dst, 0);
1695
1696       for (i = 0, j = 0; i < len; j++)
1697         {
1698           i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1699           dst[j] = c;
1700         }
1701     }
1702
1703   return res;
1704 }
1705
1706 SCM
1707 scm_from_utf32_string (const scm_t_wchar *str)
1708 {
1709   return scm_from_utf32_stringn (str, -1);
1710 }
1711
1712 SCM
1713 scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1714 {
1715   SCM result;
1716   scm_t_wchar *buf;
1717
1718   if (len == (size_t) -1)
1719     len = u32_strlen ((uint32_t *) str);
1720
1721   result = scm_i_make_wide_string (len, &buf, 0);
1722   memcpy (buf, str, len * sizeof (scm_t_wchar));
1723   scm_i_try_narrow_string (result);
1724
1725   return result;
1726 }
1727
1728 SCM
1729 scm_from_port_string (const char *str, SCM port)
1730 {
1731   return scm_from_port_stringn (str, -1, port);
1732 }
1733
1734 SCM
1735 scm_from_port_stringn (const char *str, size_t len, SCM port)
1736 {
1737   scm_t_port *pt = SCM_PTAB_ENTRY (port);
1738   scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
1739
1740   if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1)
1741     return scm_from_latin1_stringn (str, len);
1742   else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8
1743            && (pt->ilseq_handler == SCM_FAILED_CONVERSION_ERROR
1744                || (u8_check ((uint8_t *) str, len) == NULL)))
1745     return scm_from_utf8_stringn (str, len);
1746   else
1747     return scm_from_stringn (str, len, pt->encoding, pt->ilseq_handler);
1748 }
1749
1750 /* Create a new scheme string from the C string STR.  The memory of
1751    STR may be used directly as storage for the new string.  */
1752 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1753    would be to register a finalizer to eventually free(3) STR, which isn't
1754    worth it.  Should we just deprecate the `scm_take_' functions?  */
1755 SCM
1756 scm_take_locale_stringn (char *str, size_t len)
1757 {
1758   SCM res;
1759
1760   res = scm_from_locale_stringn (str, len);
1761   free (str);
1762
1763   return res;
1764 }
1765
1766 SCM
1767 scm_take_locale_string (char *str)
1768 {
1769   return scm_take_locale_stringn (str, -1);
1770 }
1771
1772 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1773    *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1774    Set *LENP to the size of the resulting string.
1775
1776    FIXME: This is a hack we should get rid of.  See
1777    <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1778    for details.  */
1779 static void
1780 unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
1781 {
1782   char *before, *after;
1783   size_t i, j;
1784
1785   before = buf;
1786   after = buf;
1787   i = 0;
1788   j = 0;
1789   while (i < *lenp)
1790     {
1791       if ((i <= *lenp - 6)
1792           && before[i] == '\\'
1793           && before[i + 1] == 'u'
1794           && before[i + 2] == '0' && before[i + 3] == '0')
1795         {
1796           /* Convert \u00NN to \xNN */
1797           after[j] = '\\';
1798           after[j + 1] = 'x';
1799           after[j + 2] = tolower ((int) before[i + 4]);
1800           after[j + 3] = tolower ((int) before[i + 5]);
1801           i += 6;
1802           j += 4;
1803         }
1804       else if ((i <= *lenp - 10)
1805                && before[i] == '\\'
1806                && before[i + 1] == 'U'
1807                && before[i + 2] == '0' && before[i + 3] == '0')
1808         {
1809           /* Convert \U00NNNNNN to \UNNNNNN */
1810           after[j] = '\\';
1811           after[j + 1] = 'U';
1812           after[j + 2] = tolower ((int) before[i + 4]);
1813           after[j + 3] = tolower ((int) before[i + 5]);
1814           after[j + 4] = tolower ((int) before[i + 6]);
1815           after[j + 5] = tolower ((int) before[i + 7]);
1816           after[j + 6] = tolower ((int) before[i + 8]);
1817           after[j + 7] = tolower ((int) before[i + 9]);
1818           i += 10;
1819           j += 8;
1820         }
1821       else
1822         {
1823           after[j] = before[i];
1824           i++;
1825           j++;
1826         }
1827     }
1828   *lenp = j;
1829 }
1830
1831 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1832    *LENP-byte locale-encoded string, to `\xXXXX;'.  Set *LEN to the size
1833    of the resulting string.  BUF must be large enough to handle the
1834    worst case when `\uXXXX' escapes (6 characters) are replaced by
1835    `\xXXXX;' (7 characters).  */
1836 static void
1837 unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
1838 {
1839   char *before, *after;
1840   size_t i, j;
1841   /* The worst case is if the input string contains all 4-digit hex escapes.
1842      "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1843   size_t max_out_len = (*lenp * 7) / 6 + 1;
1844   size_t nzeros, ndigits;
1845
1846   before = buf;
1847   after = alloca (max_out_len);
1848   i = 0;
1849   j = 0;
1850   while (i < *lenp)
1851     {
1852       if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1853           || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1854         {
1855           if (before[i + 1] == 'u')
1856             ndigits = 4;
1857           else if (before[i + 1] == 'U')
1858             ndigits = 8;
1859           else
1860             abort ();
1861
1862           /* Add the R6RS hex escape initial sequence.  */
1863           after[j] = '\\';
1864           after[j + 1] = 'x';
1865
1866           /* Move string positions to the start of the hex numbers.  */
1867           i += 2;
1868           j += 2;
1869
1870           /* Find the number of initial zeros in this hex number.  */
1871           nzeros = 0;
1872           while (before[i + nzeros] == '0' && nzeros < ndigits)
1873             nzeros++;
1874
1875           /* Copy the number, skipping initial zeros, and then move the string
1876              positions.  */
1877           if (nzeros == ndigits)
1878             {
1879               after[j] = '0';
1880               i += ndigits;
1881               j += 1;
1882             }
1883           else
1884             {
1885               int pos;
1886               for (pos = 0; pos < ndigits - nzeros; pos++)
1887                 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1888               i += ndigits;
1889               j += (ndigits - nzeros);
1890             }
1891
1892           /* Add terminating semicolon.  */
1893           after[j] = ';';
1894           j++;
1895         }
1896       else
1897         {
1898           after[j] = before[i];
1899           i++;
1900           j++;
1901         }
1902     }
1903   *lenp = j;
1904   memcpy (before, after, j);
1905 }
1906
1907 char *
1908 scm_to_locale_string (SCM str)
1909 {
1910   return scm_to_locale_stringn (str, NULL);
1911 }
1912
1913 char *
1914 scm_to_locale_stringn (SCM str, size_t *lenp)
1915 {
1916   return scm_to_stringn (str, lenp,
1917                          locale_charset (),
1918                          scm_i_default_port_conversion_handler ());
1919 }
1920
1921 char *
1922 scm_to_latin1_string (SCM str)
1923 {
1924   return scm_to_latin1_stringn (str, NULL);
1925 }
1926
1927 char *
1928 scm_to_latin1_stringn (SCM str, size_t *lenp)
1929 #define FUNC_NAME "scm_to_latin1_stringn"
1930 {
1931   char *result;
1932
1933   SCM_VALIDATE_STRING (1, str);
1934
1935   if (scm_i_is_narrow_string (str))
1936     {
1937       size_t len = scm_i_string_length (str);
1938
1939       if (lenp)
1940         *lenp = len;
1941
1942       result = scm_strndup (scm_i_string_data (str), len);
1943     }
1944   else
1945     result = scm_to_stringn (str, lenp, NULL,
1946                              SCM_FAILED_CONVERSION_ERROR);
1947
1948   return result;
1949 }
1950 #undef FUNC_NAME
1951
1952 char *
1953 scm_to_utf8_string (SCM str)
1954 {
1955   return scm_to_utf8_stringn (str, NULL);
1956 }
1957
1958 static size_t
1959 latin1_u8_strlen (const scm_t_uint8 *str, size_t len)
1960 {
1961   size_t ret, i;
1962   for (i = 0, ret = 0; i < len; i++)
1963     ret += (str[i] < 128) ? 1 : 2;
1964   return ret;
1965 }
1966
1967 static scm_t_uint8*
1968 latin1_to_u8 (const scm_t_uint8 *str, size_t latin_len,
1969               scm_t_uint8 *u8_result, size_t *u8_lenp)
1970 {
1971   size_t i, n;
1972   size_t u8_len = latin1_u8_strlen (str, latin_len);
1973
1974   if (!(u8_result && u8_lenp && *u8_lenp > u8_len))
1975     u8_result = scm_malloc (u8_len + 1);
1976   if (u8_lenp)
1977     *u8_lenp = u8_len;
1978
1979   for (i = 0, n = 0; i < latin_len; i++)
1980     n += u8_uctomb (u8_result + n, str[i], u8_len - n);
1981   if (n != u8_len)
1982     abort ();
1983   u8_result[n] = 0;
1984
1985   return u8_result;
1986 }
1987
1988 /* UTF-8 code table
1989
1990    (Note that this includes code points that are not allowed by Unicode,
1991     but since this function has no way to report an error, and its
1992     purpose is to determine the size of destination buffers for
1993     libunicode conversion functions, we err on the safe side and handle
1994     everything that libunicode might conceivably handle, now or in the
1995     future.)
1996
1997    Char. number range  |        UTF-8 octet sequence
1998       (hexadecimal)    |              (binary)
1999    --------------------+------------------------------------------------------
2000    0000 0000-0000 007F | 0xxxxxxx
2001    0000 0080-0000 07FF | 110xxxxx 10xxxxxx
2002    0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
2003    0001 0000-001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
2004    0020 0000-03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2005    0400 0000-7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2006 */
2007
2008 static size_t
2009 u32_u8_length_in_bytes (const scm_t_uint32 *str, size_t len)
2010 {
2011   size_t ret, i;
2012
2013   for (i = 0, ret = 0; i < len; i++)
2014     {
2015       scm_t_uint32 c = str[i];
2016
2017       if (c <= 0x7f)
2018         ret += 1;
2019       else if (c <= 0x7ff)
2020         ret += 2;
2021       else if (c <= 0xffff)
2022         ret += 3;
2023       else if (c <= 0x1fffff)
2024         ret += 4;
2025       else if (c <= 0x3ffffff)
2026         ret += 5;
2027       else
2028         ret += 6;
2029     }
2030
2031   return ret;
2032 }
2033
2034 char *
2035 scm_to_utf8_stringn (SCM str, size_t *lenp)
2036 #define FUNC_NAME "scm_to_utf8_stringn"
2037 {
2038   SCM_VALIDATE_STRING (1, str);
2039
2040   if (scm_i_is_narrow_string (str))
2041     return (char *) latin1_to_u8 ((scm_t_uint8 *) scm_i_string_chars (str),
2042                                   scm_i_string_length (str),
2043                                   NULL, lenp);
2044   else
2045     {
2046       scm_t_uint32 *chars = (scm_t_uint32 *) scm_i_string_wide_chars (str);
2047       scm_t_uint8 *buf, *ret;
2048       size_t num_chars = scm_i_string_length (str);
2049       size_t num_bytes_predicted, num_bytes_actual;
2050
2051       num_bytes_predicted = u32_u8_length_in_bytes (chars, num_chars);
2052
2053       if (lenp)
2054         {
2055           *lenp = num_bytes_predicted;
2056           buf = scm_malloc (num_bytes_predicted);
2057         }
2058       else
2059         {
2060           buf = scm_malloc (num_bytes_predicted + 1);
2061           buf[num_bytes_predicted] = 0;
2062         }
2063
2064       num_bytes_actual = num_bytes_predicted;
2065       ret = u32_to_u8 (chars, num_chars, buf, &num_bytes_actual);
2066
2067       if (SCM_LIKELY (ret == buf && num_bytes_actual == num_bytes_predicted))
2068         return (char *) ret;
2069
2070       /* An error: a bad codepoint.  */
2071       {
2072         int saved_errno = errno;
2073
2074         free (buf);
2075         if (!saved_errno)
2076           abort ();
2077
2078         scm_decoding_error ("scm_to_utf8_stringn", errno,
2079                             "invalid codepoint in string", str);
2080
2081         /* Not reached.  */
2082         return NULL;
2083       }
2084     }
2085 }
2086 #undef FUNC_NAME
2087
2088 scm_t_wchar *
2089 scm_to_utf32_string (SCM str)
2090 {
2091   return scm_to_utf32_stringn (str, NULL);
2092 }
2093
2094 scm_t_wchar *
2095 scm_to_utf32_stringn (SCM str, size_t *lenp)
2096 #define FUNC_NAME "scm_to_utf32_stringn"
2097 {
2098   scm_t_wchar *result;
2099
2100   SCM_VALIDATE_STRING (1, str);
2101
2102   if (scm_i_is_narrow_string (str))
2103     {
2104       scm_t_uint8 *codepoints;
2105       size_t i, len;
2106
2107       codepoints = (scm_t_uint8*) scm_i_string_chars (str);
2108       len = scm_i_string_length (str);
2109       if (lenp)
2110         *lenp = len;
2111
2112       result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2113       for (i = 0; i < len; i++)
2114         result[i] = codepoints[i];
2115       result[len] = 0;
2116     }
2117   else
2118     {
2119       size_t len;
2120
2121       len = scm_i_string_length (str);
2122       if (lenp)
2123         *lenp = len;
2124
2125       result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2126       memcpy (result, scm_i_string_wide_chars (str),
2127               len * sizeof (scm_t_wchar));
2128       result[len] = 0;
2129     }
2130
2131   return result;
2132 }
2133 #undef FUNC_NAME
2134
2135 char *
2136 scm_to_port_string (SCM str, SCM port)
2137 {
2138   return scm_to_port_stringn (str, NULL, port);
2139 }
2140
2141 char *
2142 scm_to_port_stringn (SCM str, size_t *lenp, SCM port)
2143 {
2144   scm_t_port *pt = SCM_PTAB_ENTRY (port);
2145   scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
2146
2147   if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1
2148       && pt->ilseq_handler == SCM_FAILED_CONVERSION_ERROR)
2149     return scm_to_latin1_stringn (str, lenp);
2150   else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8)
2151     return scm_to_utf8_stringn (str, lenp);
2152   else
2153     return scm_to_stringn (str, lenp, pt->encoding, pt->ilseq_handler);
2154 }
2155
2156 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
2157    according to ENCODING.  If LENP is non-NULL, set it to the size in bytes of
2158    the returned buffer.  If the conversion to ENCODING fails, apply the strategy
2159    defined by HANDLER.  */
2160 char *
2161 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
2162                 scm_t_string_failed_conversion_handler handler)
2163 {
2164   char *buf;
2165   size_t ilen, len, i;
2166   int ret;
2167   const char *enc;
2168
2169   if (!scm_is_string (str))
2170     scm_wrong_type_arg_msg (NULL, 0, str, "string");
2171   ilen = scm_i_string_length (str);
2172
2173   if (ilen == 0)
2174     {
2175       buf = scm_malloc (1);
2176       buf[0] = '\0';
2177       if (lenp)
2178         *lenp = 0;
2179       return buf;
2180     }
2181
2182   if (lenp == NULL)
2183     for (i = 0; i < ilen; i++)
2184       if (scm_i_string_ref (str, i) == '\0')
2185         scm_misc_error (NULL,
2186                         "string contains #\\nul character: ~S",
2187                         scm_list_1 (str));
2188
2189   if (scm_i_is_narrow_string (str)
2190       && c_strcasecmp (encoding, "ISO-8859-1") == 0)
2191     {
2192       /* If using native Latin-1 encoding, just copy the string
2193          contents.  */
2194       if (lenp)
2195         {
2196           buf = scm_malloc (ilen);
2197           memcpy (buf, scm_i_string_chars (str), ilen);
2198           *lenp = ilen;
2199           return buf;
2200         }
2201       else
2202         {
2203           buf = scm_malloc (ilen + 1);
2204           memcpy (buf, scm_i_string_chars (str), ilen);
2205           buf[ilen] = '\0';
2206           return buf;
2207         }
2208     }
2209
2210
2211   buf = NULL;
2212   len = 0;
2213   enc = encoding;
2214   if (enc == NULL)
2215     enc = "ISO-8859-1";
2216   if (scm_i_is_narrow_string (str))
2217     {
2218       ret = mem_iconveh (scm_i_string_chars (str), ilen,
2219                          "ISO-8859-1", enc,
2220                          (enum iconv_ilseq_handler) handler, NULL,
2221                          &buf, &len);
2222
2223       if (ret != 0)
2224         scm_encoding_error (__func__, errno,
2225                             "cannot convert narrow string to output locale",
2226                             SCM_BOOL_F,
2227                             /* FIXME: Faulty character unknown.  */
2228                             SCM_BOOL_F);
2229     }
2230   else
2231     {
2232       buf = u32_conv_to_encoding (enc,
2233                                   (enum iconv_ilseq_handler) handler,
2234                                   (scm_t_uint32 *) scm_i_string_wide_chars (str),
2235                                   ilen,
2236                                   NULL,
2237                                   NULL, &len);
2238       if (buf == NULL)
2239         scm_encoding_error (__func__, errno,
2240                             "cannot convert wide string to output locale",
2241                             SCM_BOOL_F,
2242                             /* FIXME: Faulty character unknown.  */
2243                             SCM_BOOL_F);
2244     }
2245   if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
2246     {
2247       if (SCM_R6RS_ESCAPES_P)
2248         {
2249           /* The worst case is if the input string contains all 4-digit
2250              hex escapes.  "\uXXXX" (six characters) becomes "\xXXXX;"
2251              (seven characters).  Make BUF large enough to hold
2252              that.  */
2253           buf = scm_realloc (buf, (len * 7) / 6 + 1);
2254           unistring_escapes_to_r6rs_escapes (buf, &len);
2255         }
2256       else
2257         unistring_escapes_to_guile_escapes (buf, &len);
2258
2259       buf = scm_realloc (buf, len);
2260     }
2261   if (lenp)
2262     *lenp = len;
2263   else
2264     {
2265       buf = scm_realloc (buf, len + 1);
2266       buf[len] = '\0';
2267     }
2268
2269   scm_remember_upto_here_1 (str);
2270   return buf;
2271 }
2272
2273 size_t
2274 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
2275 {
2276   size_t len;
2277   char *result = NULL;
2278   if (!scm_is_string (str))
2279     scm_wrong_type_arg_msg (NULL, 0, str, "string");
2280   result = scm_to_locale_stringn (str, &len);
2281
2282   memcpy (buf, result, (len > max_len) ? max_len : len);
2283   free (result);
2284
2285   scm_remember_upto_here_1 (str);
2286   return len;
2287 }
2288
2289 \f
2290 /* Unicode string normalization.  */
2291
2292 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2293    libguile/i18n.c.  It would be useful to have this factored out into a more
2294    convenient location, but its use of alloca makes that tricky to do. */
2295
2296 static SCM
2297 normalize_str (SCM string, uninorm_t form)
2298 {
2299   SCM ret;
2300   scm_t_uint32 *w_str;
2301   scm_t_wchar *cbuf;
2302   size_t rlen, len = scm_i_string_length (string);
2303
2304   if (scm_i_is_narrow_string (string))
2305     {
2306       size_t i;
2307       const char *buf = scm_i_string_chars (string);
2308
2309       w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2310
2311       for (i = 0; i < len; i ++)
2312         w_str[i] = (unsigned char) buf[i];
2313       w_str[len] = 0;
2314     }
2315   else
2316     w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2317
2318   w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2319
2320   ret = scm_i_make_wide_string (rlen, &cbuf, 0);
2321   u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2322   free (w_str);
2323
2324   scm_i_try_narrow_string (ret);
2325
2326   return ret;
2327 }
2328
2329 SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2330             (SCM string),
2331             "Returns the NFC normalized form of @var{string}.")
2332 #define FUNC_NAME s_scm_string_normalize_nfc
2333 {
2334   SCM_VALIDATE_STRING (1, string);
2335   return normalize_str (string, UNINORM_NFC);
2336 }
2337 #undef FUNC_NAME
2338
2339 SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2340             (SCM string),
2341             "Returns the NFD normalized form of @var{string}.")
2342 #define FUNC_NAME s_scm_string_normalize_nfd
2343 {
2344   SCM_VALIDATE_STRING (1, string);
2345   return normalize_str (string, UNINORM_NFD);
2346 }
2347 #undef FUNC_NAME
2348
2349 SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2350             (SCM string),
2351             "Returns the NFKC normalized form of @var{string}.")
2352 #define FUNC_NAME s_scm_string_normalize_nfkc
2353 {
2354   SCM_VALIDATE_STRING (1, string);
2355   return normalize_str (string, UNINORM_NFKC);
2356 }
2357 #undef FUNC_NAME
2358
2359 SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2360             (SCM string),
2361             "Returns the NFKD normalized form of @var{string}.")
2362 #define FUNC_NAME s_scm_string_normalize_nfkd
2363 {
2364   SCM_VALIDATE_STRING (1, string);
2365   return normalize_str (string, UNINORM_NFKD);
2366 }
2367 #undef FUNC_NAME
2368
2369 /* converts C scm_array of strings to SCM scm_list of strings.
2370    If argc < 0, a null terminated scm_array is assumed.
2371    The current locale encoding is assumed */
2372 SCM
2373 scm_makfromstrs (int argc, char **argv)
2374 {
2375   int i = argc;
2376   SCM lst = SCM_EOL;
2377   if (0 > i)
2378     for (i = 0; argv[i]; i++);
2379   while (i--)
2380     lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2381   return lst;
2382 }
2383
2384 /* Return a newly allocated array of char pointers to each of the strings
2385    in args, with a terminating NULL pointer.  The strings are encoded using
2386    the current locale. */
2387
2388 char **
2389 scm_i_allocate_string_pointers (SCM list)
2390 #define FUNC_NAME "scm_i_allocate_string_pointers"
2391 {
2392   char **result;
2393   int list_len = scm_ilength (list);
2394   int i;
2395
2396   if (list_len < 0)
2397     scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2398
2399   result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2400                           "string pointers");
2401   result[list_len] = NULL;
2402
2403   /* The list might have been modified in another thread, so
2404      we check LIST before each access.
2405    */
2406   for (i = 0; i < list_len && scm_is_pair (list); i++)
2407     {
2408       SCM str = SCM_CAR (list);
2409       size_t len;  /* String length in bytes */
2410       char *c_str = scm_to_locale_stringn (str, &len);
2411
2412       /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2413          scm_malloc to allocate the returned string, which must be
2414          explicitly deallocated.  This forces us to copy the string a
2415          second time into a new buffer.  Ideally there would be variants
2416          of scm_to_*_stringn that can return garbage-collected buffers. */
2417
2418       result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2419       memcpy (result[i], c_str, len);
2420       result[i][len] = '\0';
2421       free (c_str);
2422
2423       list = SCM_CDR (list);
2424     }
2425
2426   return result;
2427 }
2428 #undef FUNC_NAME
2429
2430 void
2431 scm_i_get_substring_spec (size_t len,
2432                           SCM start, size_t *cstart,
2433                           SCM end, size_t *cend)
2434 {
2435   if (SCM_UNBNDP (start))
2436     *cstart = 0;
2437   else
2438     *cstart = scm_to_unsigned_integer (start, 0, len);
2439
2440   if (SCM_UNBNDP (end))
2441     *cend = len;
2442   else
2443     *cend = scm_to_unsigned_integer (end, *cstart, len);
2444 }
2445
2446 static SCM
2447 string_handle_ref (scm_t_array_handle *h, size_t index)
2448 {
2449   return scm_c_string_ref (h->array, index);
2450 }
2451
2452 static void
2453 string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2454 {
2455   scm_c_string_set_x (h->array, index, val);
2456 }
2457
2458 static void
2459 string_get_handle (SCM v, scm_t_array_handle *h)
2460 {
2461   h->array = v;
2462   h->ndims = 1;
2463   h->dims = &h->dim0;
2464   h->dim0.lbnd = 0;
2465   h->dim0.ubnd = scm_c_string_length (v) - 1;
2466   h->dim0.inc = 1;
2467   h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2468   h->elements = h->writable_elements = NULL;
2469 }
2470
2471 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2472                           string_handle_ref, string_handle_set,
2473                           string_get_handle)
2474 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2475
2476 void
2477 scm_init_strings ()
2478 {
2479   scm_nullstr = scm_i_make_string (0, NULL, 0);
2480
2481 #include "libguile/strings.x"
2482 }
2483
2484
2485 /*
2486   Local Variables:
2487   c-file-style: "gnu"
2488   End:
2489 */