libguile/strings.c

   1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
   2  *
   3  * This library is free software; you can redistribute it and/or
   4  * modify it under the terms of the GNU Lesser General Public License
   5  * as published by the Free Software Foundation; either version 3 of
   6  * the License, or (at your option) any later version.
   7  *
   8  * This library is distributed in the hope that it will be useful, but
   9  * WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * Lesser General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU Lesser General Public
  14  * License along with this library; if not, write to the Free Software
  15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  16  * 02110-1301 USA
  17  */
  18
  19
  20 \f
  21 #ifdef HAVE_CONFIG_H
  22 # include <config.h>
  23 #endif
  24
  25 #include <alloca.h>
  26 #include <string.h>
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <uninorm.h>
  30 #include <unistr.h>
  31 #include <uniconv.h>
  32 #include <c-strcase.h>
  33
  34 #include "striconveh.h"
  35
  36 #include "libguile/_scm.h"
  37 #include "libguile/chars.h"
  38 #include "libguile/root.h"
  39 #include "libguile/strings.h"
  40 #include "libguile/ports.h"
  41 #include "libguile/ports-internal.h"
  42 #include "libguile/error.h"
  43 #include "libguile/generalized-vectors.h"
  44 #include "libguile/deprecation.h"
  45 #include "libguile/validate.h"
  46 #include "libguile/private-options.h"
  47
  48 \f
  49
  50 /* {Strings}
  51  */
  52
  53
  54 /* Stringbufs
  55  *
  56  * XXX - keeping an accurate refcount during GC seems to be quite
  57  * tricky, so we just keep score of whether a stringbuf might be
  58  * shared, not whether it definitely is.
  59  *
  60  * The scheme I (mvo) tried to keep an accurate reference count would
  61  * recount all strings that point to a stringbuf during the mark-phase
  62  * of the GC.  This was done since one cannot access the stringbuf of
  63  * a string when that string is freed (in order to decrease the
  64  * reference count).  The memory of the stringbuf might have been
  65  * reused already for something completely different.
  66  *
  67  * This recounted worked for a small number of threads beating on
  68  * cow-strings, but it failed randomly with more than 10 threads, say.
  69  * I couldn't figure out what went wrong, so I used the conservative
  70  * approach implemented below.
  71  *
  72  * There are 2 storage strategies for stringbufs: 8-bit and wide.  8-bit
  73  * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
  74  * strings.
  75  */
  76
  77 /* The size in words of the stringbuf header (type tag + size).  */
  78 #define STRINGBUF_HEADER_SIZE   2U
  79
  80 #define STRINGBUF_HEADER_BYTES  (STRINGBUF_HEADER_SIZE * sizeof (SCM))
  81
  82 #define STRINGBUF_F_SHARED      SCM_I_STRINGBUF_F_SHARED
  83 #define STRINGBUF_F_WIDE        SCM_I_STRINGBUF_F_WIDE
  84
  85 #define STRINGBUF_TAG           scm_tc7_stringbuf
  86 #define STRINGBUF_SHARED(buf)   (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
  87 #define STRINGBUF_WIDE(buf)     (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
  88
  89 #define STRINGBUF_CONTENTS(buf) ((void *)                               \
  90                                  SCM_CELL_OBJECT_LOC (buf,              \
  91                                                       STRINGBUF_HEADER_SIZE))
  92 #define STRINGBUF_CHARS(buf)    ((unsigned char *) STRINGBUF_CONTENTS (buf))
  93 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
  94
  95 #define STRINGBUF_LENGTH(buf)   (SCM_CELL_WORD_1 (buf))
  96
  97 #define SET_STRINGBUF_SHARED(buf)                                       \
  98   do                                                                    \
  99     {                                                                   \
 100       /* Don't modify BUF if it's already marked as shared since it might be \
 101          a read-only, statically allocated stringbuf.  */               \
 102       if (SCM_LIKELY (!STRINGBUF_SHARED (buf)))                         \
 103         SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
 104     }                                                                   \
 105   while (0)
 106
 107 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 108 static size_t lenhist[1001];
 109 #endif
 110
 111 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
 112    characters. */
 113 static SCM
 114 make_stringbuf (size_t len)
 115 {
 116   /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
 117      scm_i_symbol_chars, all stringbufs are null-terminated.  Once
 118      SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
 119      has been changed for scm_i_symbol_chars, this null-termination
 120      can be dropped.
 121   */
 122
 123   SCM buf;
 124
 125 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 126   if (len < 1000)
 127     lenhist[len]++;
 128   else
 129     lenhist[1000]++;
 130 #endif
 131
 132   buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
 133                                             "string"));
 134
 135   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
 136   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 137
 138   STRINGBUF_CHARS (buf)[len] = 0;
 139
 140   return buf;
 141 }
 142
 143 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
 144    characters.  */
 145 static SCM
 146 make_wide_stringbuf (size_t len)
 147 {
 148   SCM buf;
 149   size_t raw_len;
 150
 151 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 152   if (len < 1000)
 153     lenhist[len]++;
 154   else
 155     lenhist[1000]++;
 156 #endif
 157
 158   raw_len = (len + 1) * sizeof (scm_t_wchar);
 159   buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
 160                                             "string"));
 161
 162   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
 163   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 164
 165   STRINGBUF_WIDE_CHARS (buf)[len] = 0;
 166
 167   return buf;
 168 }
 169
 170 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
 171    characters from BUF.  */
 172 static SCM
 173 wide_stringbuf (SCM buf)
 174 {
 175   SCM new_buf;
 176
 177   if (STRINGBUF_WIDE (buf))
 178     new_buf = buf;
 179   else
 180     {
 181       size_t i, len;
 182       scm_t_wchar *mem;
 183
 184       len = STRINGBUF_LENGTH (buf);
 185
 186       new_buf = make_wide_stringbuf (len);
 187
 188       mem = STRINGBUF_WIDE_CHARS (new_buf);
 189       for (i = 0; i < len; i++)
 190         mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
 191       mem[len] = 0;
 192     }
 193
 194   return new_buf;
 195 }
 196
 197 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
 198    characters from BUF, if possible.  */
 199 static SCM
 200 narrow_stringbuf (SCM buf)
 201 {
 202   SCM new_buf;
 203
 204   if (!STRINGBUF_WIDE (buf))
 205     new_buf = buf;
 206   else
 207     {
 208       size_t i, len;
 209       scm_t_wchar *wmem;
 210       unsigned char *mem;
 211
 212       len = STRINGBUF_LENGTH (buf);
 213       wmem = STRINGBUF_WIDE_CHARS (buf);
 214
 215       for (i = 0; i < len; i++)
 216         if (wmem[i] > 0xFF)
 217           /* BUF cannot be narrowed.  */
 218           return buf;
 219
 220       new_buf = make_stringbuf (len);
 221
 222       mem = STRINGBUF_CHARS (new_buf);
 223       for (i = 0; i < len; i++)
 224         mem[i] = (unsigned char) wmem[i];
 225       mem[len] = 0;
 226     }
 227
 228   return new_buf;
 229 }
 230
 231 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
 232
 233 \f
 234 /* Copy-on-write strings.
 235  */
 236
 237 #define STRING_TAG            scm_tc7_string
 238
 239 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
 240 #define STRING_START(str)     ((size_t)SCM_CELL_WORD_2(str))
 241 #define STRING_LENGTH(str)    ((size_t)SCM_CELL_WORD_3(str))
 242
 243 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
 244 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
 245
 246 #define IS_STRING(str)        (SCM_HAS_TYP7 (str, STRING_TAG))
 247
 248 /* Read-only strings.
 249  */
 250
 251 #define RO_STRING_TAG         scm_tc7_ro_string
 252 #define IS_RO_STRING(str)     (SCM_CELL_TYPE(str)==RO_STRING_TAG)
 253
 254 /* Mutation-sharing substrings
 255  */
 256
 257 #define SH_STRING_TAG       (scm_tc7_string + 0x100)
 258
 259 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
 260 /* START and LENGTH as for STRINGs. */
 261
 262 #define IS_SH_STRING(str)   (SCM_CELL_TYPE(str)==SH_STRING_TAG)
 263
 264 SCM scm_nullstr;
 265
 266 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
 267    characters.  CHARSP, if not NULL, will be set to location of the
 268    char array.  If READ_ONLY_P, the returned string is read-only;
 269    otherwise it is writable.  */
 270 SCM
 271 scm_i_make_string (size_t len, char **charsp, int read_only_p)
 272 {
 273   static SCM null_stringbuf = SCM_BOOL_F;
 274   SCM buf;
 275   SCM res;
 276
 277   if (len == 0)
 278     {
 279       if (SCM_UNLIKELY (scm_is_false (null_stringbuf)))
 280         {
 281           null_stringbuf = make_stringbuf (0);
 282           SET_STRINGBUF_SHARED (null_stringbuf);
 283         }
 284       buf = null_stringbuf;
 285     }
 286   else
 287     buf = make_stringbuf (len);
 288
 289   if (charsp)
 290     *charsp = (char *) STRINGBUF_CHARS (buf);
 291   res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
 292                          SCM_UNPACK (buf),
 293                          (scm_t_bits) 0, (scm_t_bits) len);
 294   return res;
 295 }
 296
 297 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
 298    characters.  CHARSP, if not NULL, will be set to location of the
 299    character array.  If READ_ONLY_P, the returned string is read-only;
 300    otherwise it is writable.  */
 301 SCM
 302 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
 303 {
 304   SCM buf = make_wide_stringbuf (len);
 305   SCM res;
 306   if (charsp)
 307     *charsp = STRINGBUF_WIDE_CHARS (buf);
 308   res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
 309                          SCM_UNPACK (buf),
 310                          (scm_t_bits) 0, (scm_t_bits) len);
 311   return res;
 312 }
 313
 314 static void
 315 validate_substring_args (SCM str, size_t start, size_t end)
 316 {
 317   if (!IS_STRING (str))
 318     scm_wrong_type_arg_msg (NULL, 0, str, "string");
 319   if (start > STRING_LENGTH (str))
 320     scm_out_of_range (NULL, scm_from_size_t (start));
 321   if (end > STRING_LENGTH (str) || end < start)
 322     scm_out_of_range (NULL, scm_from_size_t (end));
 323 }
 324
 325 static inline void
 326 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
 327 {
 328   *start = STRING_START (*str);
 329   if (IS_SH_STRING (*str))
 330     {
 331       *str = SH_STRING_STRING (*str);
 332       *start += STRING_START (*str);
 333     }
 334   *buf = STRING_STRINGBUF (*str);
 335 }
 336
 337 SCM
 338 scm_i_substring (SCM str, size_t start, size_t end)
 339 {
 340   if (start == end)
 341     return scm_i_make_string (0, NULL, 0);
 342   else
 343     {
 344       SCM buf;
 345       size_t str_start;
 346       get_str_buf_start (&str, &buf, &str_start);
 347       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 348       SET_STRINGBUF_SHARED (buf);
 349       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 350       return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
 351                               (scm_t_bits)str_start + start,
 352                               (scm_t_bits) end - start);
 353     }
 354 }
 355
 356 SCM
 357 scm_i_substring_read_only (SCM str, size_t start, size_t end)
 358 {
 359   if (start == end)
 360     return scm_i_make_string (0, NULL, 1);
 361   else
 362     {
 363       SCM buf;
 364       size_t str_start;
 365       get_str_buf_start (&str, &buf, &str_start);
 366       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 367       SET_STRINGBUF_SHARED (buf);
 368       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 369       return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
 370                               (scm_t_bits)str_start + start,
 371                               (scm_t_bits) end - start);
 372     }
 373 }
 374
 375 SCM
 376 scm_i_substring_copy (SCM str, size_t start, size_t end)
 377 {
 378   if (start == end)
 379     return scm_i_make_string (0, NULL, 0);
 380   else
 381     {
 382       size_t len = end - start;
 383       SCM buf, my_buf, substr;
 384       size_t str_start;
 385       int wide = 0;
 386       get_str_buf_start (&str, &buf, &str_start);
 387       if (scm_i_is_narrow_string (str))
 388         {
 389           my_buf = make_stringbuf (len);
 390           memcpy (STRINGBUF_CHARS (my_buf),
 391                   STRINGBUF_CHARS (buf) + str_start + start, len);
 392         }
 393       else
 394         {
 395           my_buf = make_wide_stringbuf (len);
 396           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
 397                    (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
 398                                      + start), len);
 399           wide = 1;
 400         }
 401       scm_remember_upto_here_1 (buf);
 402       substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
 403                                 (scm_t_bits) 0, (scm_t_bits) len);
 404       if (wide)
 405         scm_i_try_narrow_string (substr);
 406       return substr;
 407     }
 408 }
 409
 410 SCM
 411 scm_i_substring_shared (SCM str, size_t start, size_t end)
 412 {
 413   if (start == 0 && end == STRING_LENGTH (str))
 414     return str;
 415   else if (start == end)
 416     return scm_i_make_string (0, NULL, 0);
 417   else
 418     {
 419       size_t len = end - start;
 420       if (IS_SH_STRING (str))
 421         {
 422           start += STRING_START (str);
 423           str = SH_STRING_STRING (str);
 424         }
 425       return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
 426                               (scm_t_bits)start, (scm_t_bits) len);
 427     }
 428 }
 429
 430 SCM
 431 scm_c_substring (SCM str, size_t start, size_t end)
 432 {
 433   validate_substring_args (str, start, end);
 434   return scm_i_substring (str, start, end);
 435 }
 436
 437 SCM
 438 scm_c_substring_read_only (SCM str, size_t start, size_t end)
 439 {
 440   validate_substring_args (str, start, end);
 441   return scm_i_substring_read_only (str, start, end);
 442 }
 443
 444 SCM
 445 scm_c_substring_copy (SCM str, size_t start, size_t end)
 446 {
 447   validate_substring_args (str, start, end);
 448   return scm_i_substring_copy (str, start, end);
 449 }
 450
 451 SCM
 452 scm_c_substring_shared (SCM str, size_t start, size_t end)
 453 {
 454   validate_substring_args (str, start, end);
 455   return scm_i_substring_shared (str, start, end);
 456 }
 457
 458 \f
 459 /* Internal accessors
 460  */
 461
 462 /* Returns the number of characters in STR.  This may be different
 463    than the memory size of the string storage.  */
 464 size_t
 465 scm_i_string_length (SCM str)
 466 {
 467   return STRING_LENGTH (str);
 468 }
 469
 470 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
 471    encoding.  False if it is 'wide', having a 32-bit UCS-4
 472    encoding.  */
 473 int
 474 scm_i_is_narrow_string (SCM str)
 475 {
 476   if (IS_SH_STRING (str))
 477     str = SH_STRING_STRING (str);
 478
 479   return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
 480 }
 481
 482 /* Try to coerce a string to be narrow.  It if is narrow already, do
 483    nothing.  If it is wide, shrink it to narrow if none of its
 484    characters are above 0xFF.  Return true if the string is narrow or
 485    was made to be narrow.  */
 486 int
 487 scm_i_try_narrow_string (SCM str)
 488 {
 489   if (IS_SH_STRING (str))
 490     str = SH_STRING_STRING (str);
 491
 492   SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
 493
 494   return scm_i_is_narrow_string (str);
 495 }
 496
 497 /* Return a pointer to the raw data of the string, which can be either Latin-1
 498    or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'.  */
 499 const void *
 500 scm_i_string_data (SCM str)
 501 {
 502   SCM buf;
 503   size_t start;
 504   const char *data;
 505
 506   get_str_buf_start (&str, &buf, &start);
 507
 508   data = STRINGBUF_CONTENTS (buf);
 509   data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
 510
 511   return data;
 512 }
 513
 514 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
 515    STR.  */
 516 const char *
 517 scm_i_string_chars (SCM str)
 518 {
 519   SCM buf;
 520   size_t start;
 521   get_str_buf_start (&str, &buf, &start);
 522   if (scm_i_is_narrow_string (str))
 523     return (const char *) STRINGBUF_CHARS (buf) + start;
 524   else
 525     scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
 526                     scm_list_1 (str));
 527   return NULL;
 528 }
 529
 530 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
 531    STR.  */
 532 const scm_t_wchar *
 533 scm_i_string_wide_chars (SCM str)
 534 {
 535   SCM buf;
 536   size_t start;
 537
 538   get_str_buf_start (&str, &buf, &start);
 539   if (!scm_i_is_narrow_string (str))
 540     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
 541   else
 542     scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
 543                     scm_list_1 (str));
 544 }
 545
 546 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
 547    a new string buffer, so that it can be modified without modifying
 548    other strings.  Also, lock the string mutex.  Later, one must call
 549    scm_i_string_stop_writing to unlock the mutex.  */
 550 SCM
 551 scm_i_string_start_writing (SCM orig_str)
 552 {
 553   SCM buf, str = orig_str;
 554   size_t start;
 555
 556   get_str_buf_start (&str, &buf, &start);
 557   if (IS_RO_STRING (str))
 558     scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
 559
 560   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 561   if (STRINGBUF_SHARED (buf))
 562     {
 563       /* Clone the stringbuf.  */
 564       size_t len = STRING_LENGTH (str);
 565       SCM new_buf;
 566
 567       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 568
 569       if (scm_i_is_narrow_string (str))
 570         {
 571           new_buf = make_stringbuf (len);
 572           memcpy (STRINGBUF_CHARS (new_buf),
 573                   STRINGBUF_CHARS (buf) + STRING_START (str), len);
 574
 575         }
 576       else
 577         {
 578           new_buf = make_wide_stringbuf (len);
 579           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 580                    (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
 581                                      + STRING_START (str)), len);
 582         }
 583
 584       SET_STRING_STRINGBUF (str, new_buf);
 585       start -= STRING_START (str);
 586
 587       /* FIXME: The following operations are not atomic, so other threads
 588          looking at STR may see an inconsistent state.  Nevertheless it can't
 589          hurt much since (i) accessing STR while it is being mutated can't
 590          yield a crash, and (ii) concurrent accesses to STR should be
 591          protected by a mutex at the application level.  The latter may not
 592          apply when STR != ORIG_STR, though.  */
 593       SET_STRING_START (str, 0);
 594       SET_STRING_STRINGBUF (str, new_buf);
 595
 596       buf = new_buf;
 597
 598       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 599     }
 600   return orig_str;
 601 }
 602
 603 /* Return a pointer to the 8-bit Latin-1 chars of a string.  */
 604 char *
 605 scm_i_string_writable_chars (SCM str)
 606 {
 607   SCM buf;
 608   size_t start;
 609
 610   get_str_buf_start (&str, &buf, &start);
 611   if (scm_i_is_narrow_string (str))
 612     return (char *) STRINGBUF_CHARS (buf) + start;
 613   else
 614     scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
 615                     scm_list_1 (str));
 616   return NULL;
 617 }
 618
 619 /* Return a pointer to the UCS-4 codepoints of a string.  */
 620 static scm_t_wchar *
 621 scm_i_string_writable_wide_chars (SCM str)
 622 {
 623   SCM buf;
 624   size_t start;
 625
 626   get_str_buf_start (&str, &buf, &start);
 627   if (!scm_i_is_narrow_string (str))
 628     return STRINGBUF_WIDE_CHARS (buf) + start;
 629   else
 630     scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
 631                     scm_list_1 (str));
 632 }
 633
 634 /* Unlock the string mutex that was locked when
 635    scm_i_string_start_writing was called.  */
 636 void
 637 scm_i_string_stop_writing (void)
 638 {
 639   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 640 }
 641
 642 /* Return the Xth character of STR as a UCS-4 codepoint.  */
 643 scm_t_wchar
 644 scm_i_string_ref (SCM str, size_t x)
 645 {
 646   if (scm_i_is_narrow_string (str))
 647     return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
 648   else
 649     return scm_i_string_wide_chars (str)[x];
 650 }
 651
 652 /* Returns index+1 of the first char in STR that matches C, or
 653    0 if the char is not found.  */
 654 int
 655 scm_i_string_contains_char (SCM str, char ch)
 656 {
 657   size_t i;
 658   size_t len = scm_i_string_length (str);
 659
 660   i = 0;
 661   if (scm_i_is_narrow_string (str))
 662     {
 663       while (i < len)
 664         {
 665           if (scm_i_string_chars (str)[i] == ch)
 666             return i+1;
 667           i++;
 668         }
 669     }
 670   else
 671     {
 672       while (i < len)
 673         {
 674           if (scm_i_string_wide_chars (str)[i]
 675               == (unsigned char) ch)
 676             return i+1;
 677           i++;
 678         }
 679     }
 680   return 0;
 681 }
 682
 683 int
 684 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
 685 {
 686   if (scm_i_is_narrow_string (sstr))
 687     {
 688       const char *a = scm_i_string_chars (sstr) + start_x;
 689       const char *b = cstr;
 690       return strncmp (a, b, strlen(b));
 691     }
 692   else
 693     {
 694       size_t i;
 695       const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
 696       const char *b = cstr;
 697       for (i = 0; i < strlen (b); i++)
 698         {
 699           if (a[i] != (unsigned char) b[i])
 700             return 1;
 701         }
 702     }
 703   return 0;
 704 }
 705
 706 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
 707 void
 708 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
 709 {
 710   if (IS_SH_STRING (str))
 711     {
 712       p += STRING_START (str);
 713       str = SH_STRING_STRING (str);
 714     }
 715
 716   if (chr > 0xFF && scm_i_is_narrow_string (str))
 717     SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
 718
 719   if (scm_i_is_narrow_string (str))
 720     {
 721       char *dst = scm_i_string_writable_chars (str);
 722       dst[p] = chr;
 723     }
 724   else
 725     {
 726       scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
 727       dst[p] = chr;
 728     }
 729 }
 730
 731 \f
 732 /* Symbols.
 733
 734    Basic symbol creation and accessing is done here, the rest is in
 735    symbols.[hc].  This has been done to keep stringbufs and the
 736    internals of strings and string-like objects confined to this file.
 737 */
 738
 739 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
 740
 741 SCM
 742 scm_i_make_symbol (SCM name, scm_t_bits flags,
 743                    unsigned long hash, SCM props)
 744 {
 745   SCM buf;
 746   size_t start = STRING_START (name);
 747   size_t length = STRING_LENGTH (name);
 748
 749   if (IS_SH_STRING (name))
 750     {
 751       name = SH_STRING_STRING (name);
 752       start += STRING_START (name);
 753     }
 754   buf = STRING_STRINGBUF (name);
 755
 756   if (start == 0 && length == STRINGBUF_LENGTH (buf))
 757     {
 758       /* reuse buf. */
 759       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 760       SET_STRINGBUF_SHARED (buf);
 761       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 762     }
 763   else
 764     {
 765       /* make new buf. */
 766       if (scm_i_is_narrow_string (name))
 767         {
 768           SCM new_buf = make_stringbuf (length);
 769           memcpy (STRINGBUF_CHARS (new_buf),
 770                   STRINGBUF_CHARS (buf) + start, length);
 771           buf = new_buf;
 772         }
 773       else
 774         {
 775           SCM new_buf = make_wide_stringbuf (length);
 776           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 777                    (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
 778                    length);
 779           buf = new_buf;
 780         }
 781     }
 782   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 783                           (scm_t_bits) hash, SCM_UNPACK (props));
 784 }
 785
 786 SCM
 787 scm_i_c_make_symbol (const char *name, size_t len,
 788                      scm_t_bits flags, unsigned long hash, SCM props)
 789 {
 790   SCM buf = make_stringbuf (len);
 791   memcpy (STRINGBUF_CHARS (buf), name, len);
 792
 793   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 794                           (scm_t_bits) hash, SCM_UNPACK (props));
 795 }
 796
 797 /* Returns the number of characters in SYM.  This may be different
 798    from the memory size of SYM.  */
 799 size_t
 800 scm_i_symbol_length (SCM sym)
 801 {
 802   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 803 }
 804
 805 size_t
 806 scm_c_symbol_length (SCM sym)
 807 #define FUNC_NAME "scm_c_symbol_length"
 808 {
 809   SCM_VALIDATE_SYMBOL (1, sym);
 810
 811   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 812 }
 813 #undef FUNC_NAME
 814
 815 /* True if the name of SYM is stored as a Latin-1 encoded string.
 816    False if it is stored as a 32-bit UCS-4-encoded string.  */
 817 int
 818 scm_i_is_narrow_symbol (SCM sym)
 819 {
 820   SCM buf;
 821
 822   buf = SYMBOL_STRINGBUF (sym);
 823   return !STRINGBUF_WIDE (buf);
 824 }
 825
 826 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
 827    contains the name of SYM.  */
 828 const char *
 829 scm_i_symbol_chars (SCM sym)
 830 {
 831   SCM buf;
 832
 833   buf = SYMBOL_STRINGBUF (sym);
 834   if (!STRINGBUF_WIDE (buf))
 835     return (const char *) STRINGBUF_CHARS (buf);
 836   else
 837     scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
 838                     scm_list_1 (sym));
 839 }
 840
 841 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
 842    symbol's name.  */
 843 const scm_t_wchar *
 844 scm_i_symbol_wide_chars (SCM sym)
 845 {
 846   SCM buf;
 847
 848   buf = SYMBOL_STRINGBUF (sym);
 849   if (STRINGBUF_WIDE (buf))
 850     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
 851   else
 852     scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
 853                     scm_list_1 (sym));
 854 }
 855
 856 SCM
 857 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
 858 {
 859   SCM buf = SYMBOL_STRINGBUF (sym);
 860   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 861   SET_STRINGBUF_SHARED (buf);
 862   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 863   return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
 864                           (scm_t_bits)start, (scm_t_bits) end - start);
 865 }
 866
 867 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint.  */
 868 scm_t_wchar
 869 scm_i_symbol_ref (SCM sym, size_t x)
 870 {
 871   if (scm_i_is_narrow_symbol (sym))
 872     return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
 873   else
 874     return scm_i_symbol_wide_chars (sym)[x];
 875 }
 876
 877 /* Debugging
 878  */
 879
 880 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
 881             "Returns an association list containing debugging information\n"
 882             "for @var{str}. The association list has the following entries."
 883             "@table @code\n"
 884             "@item string\n"
 885             "The string itself.\n"
 886             "@item start\n"
 887             "The start index of the string into its stringbuf\n"
 888             "@item length\n"
 889             "The length of the string\n"
 890             "@item shared\n"
 891             "If this string is a substring, it returns its parent string.\n"
 892             "Otherwise, it returns @code{#f}\n"
 893             "@item read-only\n"
 894             "@code{#t} if the string is read-only\n"
 895             "@item stringbuf-chars\n"
 896             "A new string containing this string's stringbuf's characters\n"
 897             "@item stringbuf-length\n"
 898             "The number of characters in this stringbuf\n"
 899             "@item stringbuf-shared\n"
 900             "@code{#t} if this stringbuf is shared\n"
 901             "@item stringbuf-wide\n"
 902             "@code{#t} if this stringbuf's characters are stored in a\n"
 903             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
 904             "buffer\n"
 905             "@end table")
 906 #define FUNC_NAME s_scm_sys_string_dump
 907 {
 908   SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
 909   SCM buf;
 910   SCM_VALIDATE_STRING (1, str);
 911
 912   /* String info */
 913   e1 = scm_cons (scm_from_latin1_symbol ("string"),
 914                  str);
 915   e2 = scm_cons (scm_from_latin1_symbol ("start"),
 916                  scm_from_size_t (STRING_START (str)));
 917   e3 = scm_cons (scm_from_latin1_symbol ("length"),
 918                  scm_from_size_t (STRING_LENGTH (str)));
 919
 920   if (IS_SH_STRING (str))
 921     {
 922       e4 = scm_cons (scm_from_latin1_symbol ("shared"),
 923                      SH_STRING_STRING (str));
 924       buf = STRING_STRINGBUF (SH_STRING_STRING (str));
 925     }
 926   else
 927     {
 928       e4 = scm_cons (scm_from_latin1_symbol ("shared"),
 929                      SCM_BOOL_F);
 930       buf = STRING_STRINGBUF (str);
 931     }
 932
 933   if (IS_RO_STRING (str))
 934     e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
 935                    SCM_BOOL_T);
 936   else
 937     e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
 938                    SCM_BOOL_F);
 939
 940   /* Stringbuf info */
 941   if (!STRINGBUF_WIDE (buf))
 942     {
 943       size_t len = STRINGBUF_LENGTH (buf);
 944       char *cbuf;
 945       SCM sbc = scm_i_make_string (len, &cbuf, 0);
 946       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
 947       e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 948                      sbc);
 949     }
 950   else
 951     {
 952       size_t len = STRINGBUF_LENGTH (buf);
 953       scm_t_wchar *cbuf;
 954       SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
 955       u32_cpy ((scm_t_uint32 *) cbuf,
 956                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
 957       e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 958                      sbc);
 959     }
 960   e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
 961                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
 962   if (STRINGBUF_SHARED (buf))
 963     e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 964                    SCM_BOOL_T);
 965   else
 966     e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 967                    SCM_BOOL_F);
 968   if (STRINGBUF_WIDE (buf))
 969     e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 970                    SCM_BOOL_T);
 971   else
 972     e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 973                    SCM_BOOL_F);
 974
 975   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
 976 }
 977 #undef FUNC_NAME
 978
 979 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
 980             "Returns an association list containing debugging information\n"
 981             "for @var{sym}. The association list has the following entries."
 982             "@table @code\n"
 983             "@item symbol\n"
 984             "The symbol itself\n"
 985             "@item hash\n"
 986             "Its hash value\n"
 987             "@item interned\n"
 988             "@code{#t} if it is an interned symbol\n"
 989             "@item stringbuf-chars\n"
 990             "A new string containing this symbols's stringbuf's characters\n"
 991             "@item stringbuf-length\n"
 992             "The number of characters in this stringbuf\n"
 993             "@item stringbuf-shared\n"
 994             "@code{#t} if this stringbuf is shared\n"
 995             "@item stringbuf-wide\n"
 996             "@code{#t} if this stringbuf's characters are stored in a\n"
 997             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
 998             "buffer\n"
 999             "@end table")
1000 #define FUNC_NAME s_scm_sys_symbol_dump
1001 {
1002   SCM e1, e2, e3, e4, e5, e6, e7;
1003   SCM buf;
1004   SCM_VALIDATE_SYMBOL (1, sym);
1005   e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
1006                  sym);
1007   e2 = scm_cons (scm_from_latin1_symbol ("hash"),
1008                  scm_from_ulong (scm_i_symbol_hash (sym)));
1009   e3 = scm_cons (scm_from_latin1_symbol ("interned"),
1010                  scm_symbol_interned_p (sym));
1011   buf = SYMBOL_STRINGBUF (sym);
1012
1013   /* Stringbuf info */
1014   if (!STRINGBUF_WIDE (buf))
1015     {
1016       size_t len = STRINGBUF_LENGTH (buf);
1017       char *cbuf;
1018       SCM sbc = scm_i_make_string (len, &cbuf, 0);
1019       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
1020       e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1021                      sbc);
1022     }
1023   else
1024     {
1025       size_t len = STRINGBUF_LENGTH (buf);
1026       scm_t_wchar *cbuf;
1027       SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
1028       u32_cpy ((scm_t_uint32 *) cbuf,
1029                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
1030       e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1031                      sbc);
1032     }
1033   e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
1034                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
1035   if (STRINGBUF_SHARED (buf))
1036     e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1037                    SCM_BOOL_T);
1038   else
1039     e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1040                    SCM_BOOL_F);
1041   if (STRINGBUF_WIDE (buf))
1042     e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1043                     SCM_BOOL_T);
1044   else
1045     e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1046                     SCM_BOOL_F);
1047   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
1048
1049 }
1050 #undef FUNC_NAME
1051
1052 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1053
1054 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
1055 #define FUNC_NAME s_scm_sys_stringbuf_hist
1056 {
1057   int i;
1058   for (i = 0; i < 1000; i++)
1059     if (lenhist[i])
1060       fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1061   fprintf (stderr, ">999: %u\n", lenhist[1000]);
1062   return SCM_UNSPECIFIED;
1063 }
1064 #undef FUNC_NAME
1065
1066 #endif
1067
1068 \f
1069
1070 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1071             (SCM obj),
1072             "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1073 #define FUNC_NAME s_scm_string_p
1074 {
1075   return scm_from_bool (IS_STRING (obj));
1076 }
1077 #undef FUNC_NAME
1078
1079
1080 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1081
1082 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1083             (SCM chrs),
1084             "@deffnx {Scheme Procedure} list->string chrs\n"
1085             "Return a newly allocated string composed of the arguments,\n"
1086             "@var{chrs}.")
1087 #define FUNC_NAME s_scm_string
1088 {
1089   SCM result = SCM_BOOL_F;
1090   SCM rest;
1091   size_t len;
1092   size_t p = 0;
1093   long i;
1094   int wide = 0;
1095
1096   /* Verify that this is a list of chars.  */
1097   i = scm_ilength (chrs);
1098   SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1099
1100   len = (size_t) i;
1101   rest = chrs;
1102
1103   while (len > 0 && scm_is_pair (rest))
1104     {
1105       SCM elt = SCM_CAR (rest);
1106       SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1107       if (SCM_CHAR (elt) > 0xFF)
1108         wide = 1;
1109       rest = SCM_CDR (rest);
1110       len--;
1111       scm_remember_upto_here_1 (elt);
1112     }
1113
1114   /* Construct a string containing this list of chars.  */
1115   len = (size_t) i;
1116   rest = chrs;
1117
1118   if (wide == 0)
1119     {
1120       char *buf;
1121
1122       result = scm_i_make_string (len, NULL, 0);
1123       result = scm_i_string_start_writing (result);
1124       buf = scm_i_string_writable_chars (result);
1125       while (len > 0 && scm_is_pair (rest))
1126         {
1127           SCM elt = SCM_CAR (rest);
1128           buf[p] = (unsigned char) SCM_CHAR (elt);
1129           p++;
1130           rest = SCM_CDR (rest);
1131           len--;
1132           scm_remember_upto_here_1 (elt);
1133         }
1134     }
1135   else
1136     {
1137       scm_t_wchar *buf;
1138
1139       result = scm_i_make_wide_string (len, NULL, 0);
1140       result = scm_i_string_start_writing (result);
1141       buf = scm_i_string_writable_wide_chars (result);
1142       while (len > 0 && scm_is_pair (rest))
1143         {
1144           SCM elt = SCM_CAR (rest);
1145           buf[p] = SCM_CHAR (elt);
1146           p++;
1147           rest = SCM_CDR (rest);
1148           len--;
1149           scm_remember_upto_here_1 (elt);
1150         }
1151     }
1152   scm_i_string_stop_writing ();
1153
1154   if (len > 0)
1155     scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1156   if (!scm_is_null (rest))
1157     scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1158
1159   return result;
1160 }
1161 #undef FUNC_NAME
1162
1163 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1164             (SCM k, SCM chr),
1165             "Return a newly allocated string of\n"
1166             "length @var{k}.  If @var{chr} is given, then all elements of\n"
1167             "the string are initialized to @var{chr}, otherwise the contents\n"
1168             "of the string are all set to @code{#\nul}.")
1169 #define FUNC_NAME s_scm_make_string
1170 {
1171   return scm_c_make_string (scm_to_size_t (k), chr);
1172 }
1173 #undef FUNC_NAME
1174
1175 SCM
1176 scm_c_make_string (size_t len, SCM chr)
1177 #define FUNC_NAME NULL
1178 {
1179   size_t p;
1180   char *contents = NULL;
1181   SCM res = scm_i_make_string (len, &contents, 0);
1182
1183   /* If no char is given, initialize string contents to NULL.  */
1184   if (SCM_UNBNDP (chr))
1185     memset (contents, 0, len);
1186   else
1187     {
1188       SCM_VALIDATE_CHAR (0, chr);
1189       res = scm_i_string_start_writing (res);
1190       for (p = 0; p < len; p++)
1191         scm_i_string_set_x (res, p, SCM_CHAR (chr));
1192       scm_i_string_stop_writing ();
1193     }
1194
1195   return res;
1196 }
1197 #undef FUNC_NAME
1198
1199 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1200             (SCM string),
1201             "Return the number of characters in @var{string}.")
1202 #define FUNC_NAME s_scm_string_length
1203 {
1204   SCM_VALIDATE_STRING (1, string);
1205   return scm_from_size_t (STRING_LENGTH (string));
1206 }
1207 #undef FUNC_NAME
1208
1209 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1210             (SCM string),
1211             "Return the bytes used to represent a character in @var{string}."
1212             "This will return 1 or 4.")
1213 #define FUNC_NAME s_scm_string_bytes_per_char
1214 {
1215   SCM_VALIDATE_STRING (1, string);
1216   if (!scm_i_is_narrow_string (string))
1217     return scm_from_int (4);
1218
1219   return scm_from_int (1);
1220 }
1221 #undef FUNC_NAME
1222
1223 size_t
1224 scm_c_string_length (SCM string)
1225 {
1226   if (!IS_STRING (string))
1227     scm_wrong_type_arg_msg (NULL, 0, string, "string");
1228   return STRING_LENGTH (string);
1229 }
1230
1231 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1232             (SCM str, SCM k),
1233             "Return character @var{k} of @var{str} using zero-origin\n"
1234             "indexing. @var{k} must be a valid index of @var{str}.")
1235 #define FUNC_NAME s_scm_string_ref
1236 {
1237   size_t len;
1238   unsigned long idx;
1239
1240   SCM_VALIDATE_STRING (1, str);
1241
1242   len = scm_i_string_length (str);
1243   if (SCM_LIKELY (len > 0))
1244     idx = scm_to_unsigned_integer (k, 0, len - 1);
1245   else
1246     scm_out_of_range (NULL, k);
1247
1248   if (scm_i_is_narrow_string (str))
1249     return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1250   else
1251     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1252 }
1253 #undef FUNC_NAME
1254
1255 SCM
1256 scm_c_string_ref (SCM str, size_t p)
1257 {
1258   if (p >= scm_i_string_length (str))
1259     scm_out_of_range (NULL, scm_from_size_t (p));
1260   if (scm_i_is_narrow_string (str))
1261     return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1262   else
1263     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1264
1265 }
1266
1267 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1268             (SCM str, SCM k, SCM chr),
1269             "Store @var{chr} in element @var{k} of @var{str} and return\n"
1270             "an unspecified value. @var{k} must be a valid index of\n"
1271             "@var{str}.")
1272 #define FUNC_NAME s_scm_string_set_x
1273 {
1274   size_t len;
1275   unsigned long idx;
1276
1277   SCM_VALIDATE_STRING (1, str);
1278
1279   len = scm_i_string_length (str);
1280   if (SCM_LIKELY (len > 0))
1281     idx = scm_to_unsigned_integer (k, 0, len - 1);
1282   else
1283     scm_out_of_range (NULL, k);
1284
1285   SCM_VALIDATE_CHAR (3, chr);
1286   str = scm_i_string_start_writing (str);
1287   scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1288   scm_i_string_stop_writing ();
1289
1290   return SCM_UNSPECIFIED;
1291 }
1292 #undef FUNC_NAME
1293
1294 void
1295 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1296 {
1297   if (p >= scm_i_string_length (str))
1298     scm_out_of_range (NULL, scm_from_size_t (p));
1299   str = scm_i_string_start_writing (str);
1300   scm_i_string_set_x (str, p, SCM_CHAR (chr));
1301   scm_i_string_stop_writing ();
1302 }
1303
1304 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1305             (SCM str, SCM start, SCM end),
1306             "Return a newly allocated string formed from the characters\n"
1307             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1308             "ending with index @var{end} (exclusive).\n"
1309             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1310             "exact integers satisfying:\n\n"
1311             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1312 #define FUNC_NAME s_scm_substring
1313 {
1314   size_t len, from, to;
1315
1316   SCM_VALIDATE_STRING (1, str);
1317   len = scm_i_string_length (str);
1318   from = scm_to_unsigned_integer (start, 0, len);
1319   if (SCM_UNBNDP (end))
1320     to = len;
1321   else
1322     to = scm_to_unsigned_integer (end, from, len);
1323   return scm_i_substring (str, from, to);
1324 }
1325 #undef FUNC_NAME
1326
1327 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1328             (SCM str, SCM start, SCM end),
1329             "Return a newly allocated string formed from the characters\n"
1330             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1331             "ending with index @var{end} (exclusive).\n"
1332             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1333             "exact integers satisfying:\n"
1334             "\n"
1335             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1336             "\n"
1337             "The returned string is read-only.\n")
1338 #define FUNC_NAME s_scm_substring_read_only
1339 {
1340   size_t len, from, to;
1341
1342   SCM_VALIDATE_STRING (1, str);
1343   len = scm_i_string_length (str);
1344   from = scm_to_unsigned_integer (start, 0, len);
1345   if (SCM_UNBNDP (end))
1346     to = len;
1347   else
1348     to = scm_to_unsigned_integer (end, from, len);
1349   return scm_i_substring_read_only (str, from, to);
1350 }
1351 #undef FUNC_NAME
1352
1353 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1354             (SCM str, SCM start, SCM end),
1355             "Return a newly allocated string formed from the characters\n"
1356             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1357             "ending with index @var{end} (exclusive).\n"
1358             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1359             "exact integers satisfying:\n\n"
1360             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1361 #define FUNC_NAME s_scm_substring_copy
1362 {
1363   /* For the Scheme version, START is mandatory, but for the C
1364      version, it is optional.  See scm_string_copy in srfi-13.c for a
1365      rationale.
1366   */
1367
1368   size_t from, to;
1369
1370   SCM_VALIDATE_STRING (1, str);
1371   scm_i_get_substring_spec (scm_i_string_length (str),
1372                             start, &from, end, &to);
1373   return scm_i_substring_copy (str, from, to);
1374 }
1375 #undef FUNC_NAME
1376
1377 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1378             (SCM str, SCM start, SCM end),
1379             "Return string that indirectly refers to the characters\n"
1380             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1381             "ending with index @var{end} (exclusive).\n"
1382             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1383             "exact integers satisfying:\n\n"
1384             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1385 #define FUNC_NAME s_scm_substring_shared
1386 {
1387   size_t len, from, to;
1388
1389   SCM_VALIDATE_STRING (1, str);
1390   len = scm_i_string_length (str);
1391   from = scm_to_unsigned_integer (start, 0, len);
1392   if (SCM_UNBNDP (end))
1393     to = len;
1394   else
1395     to = scm_to_unsigned_integer (end, from, len);
1396   return scm_i_substring_shared (str, from, to);
1397 }
1398 #undef FUNC_NAME
1399
1400 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1401             (SCM args),
1402             "Return a newly allocated string whose characters form the\n"
1403             "concatenation of the given strings, @var{args}.")
1404 #define FUNC_NAME s_scm_string_append
1405 {
1406   SCM res;
1407   size_t total = 0;
1408   size_t len;
1409   int wide = 0;
1410   SCM l, s;
1411   size_t i;
1412   union
1413   {
1414     char *narrow;
1415     scm_t_wchar *wide;
1416   } data;
1417
1418   SCM_VALIDATE_REST_ARGUMENT (args);
1419   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1420     {
1421       s = SCM_CAR (l);
1422       SCM_VALIDATE_STRING (SCM_ARGn, s);
1423       len = scm_i_string_length (s);
1424       if (((size_t) -1) - total < len)
1425         scm_num_overflow (s_scm_string_append);
1426       total += len;
1427       if (!scm_i_is_narrow_string (s))
1428         wide = 1;
1429     }
1430   data.narrow = NULL;
1431   if (!wide)
1432     res = scm_i_make_string (total, &data.narrow, 0);
1433   else
1434     res = scm_i_make_wide_string (total, &data.wide, 0);
1435
1436   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1437     {
1438       size_t len;
1439       s = SCM_CAR (l);
1440       SCM_VALIDATE_STRING (SCM_ARGn, s);
1441       len = scm_i_string_length (s);
1442       if (len > total)
1443         SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
1444       if (!wide)
1445         {
1446           memcpy (data.narrow, scm_i_string_chars (s), len);
1447           data.narrow += len;
1448         }
1449       else
1450         {
1451           if (scm_i_is_narrow_string (s))
1452             {
1453               const char *src = scm_i_string_chars (s);
1454               for (i = 0; i < len; i++)
1455                 data.wide[i] = (unsigned char) src[i];
1456             }
1457           else
1458             u32_cpy ((scm_t_uint32 *) data.wide,
1459                      (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1460           data.wide += len;
1461         }
1462       total -= len;
1463       scm_remember_upto_here_1 (s);
1464     }
1465   if (total != 0)
1466     SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
1467   return res;
1468 }
1469 #undef FUNC_NAME
1470
1471
1472 \f
1473 /* Charset conversion error handling.  */
1474
1475 SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1476 SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1477
1478 /* Raise an exception informing that character CHR could not be written
1479    to PORT in its current encoding.  */
1480 void
1481 scm_encoding_error (const char *subr, int err, const char *message,
1482                     SCM port, SCM chr)
1483 {
1484   scm_throw (scm_encoding_error_key,
1485              scm_list_n (scm_from_latin1_string (subr),
1486                          scm_from_latin1_string (message),
1487                          scm_from_int (err),
1488                          port, chr,
1489                          SCM_UNDEFINED));
1490 }
1491
1492 /* Raise an exception informing of an encoding error on PORT.  This
1493    means that a character could not be written in PORT's encoding.  */
1494 void
1495 scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1496 {
1497   scm_throw (scm_decoding_error_key,
1498              scm_list_n (scm_from_latin1_string (subr),
1499                          scm_from_latin1_string (message),
1500                          scm_from_int (err),
1501                          port,
1502                          SCM_UNDEFINED));
1503 }
1504
1505 \f
1506 /* String conversion to/from C.  */
1507
1508 static void
1509 decoding_error (const char *func_name, int errno_save,
1510                 const char *str, size_t len)
1511 {
1512   /* Raise an error and pass the raw C string as a bytevector to the `throw'
1513      handler.  */
1514   SCM bv;
1515   signed char *buf;
1516
1517   buf = scm_gc_malloc_pointerless (len, "bytevector");
1518   memcpy (buf, str, len);
1519   bv = scm_c_take_gc_bytevector (buf, len, SCM_BOOL_F);
1520
1521   scm_decoding_error (func_name, errno_save,
1522                       "input locale conversion error", bv);
1523 }
1524
1525 SCM
1526 scm_from_stringn (const char *str, size_t len, const char *encoding,
1527                   scm_t_string_failed_conversion_handler handler)
1528 {
1529   size_t u32len, i;
1530   scm_t_wchar *u32;
1531   int wide = 0;
1532   SCM res;
1533
1534   /* The order of these checks is important. */
1535   if (!str && len != 0)
1536     scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1537   if (len == (size_t) -1)
1538     len = strlen (str);
1539
1540   if (c_strcasecmp (encoding, "ISO-8859-1") == 0 || len == 0)
1541     return scm_from_latin1_stringn (str, len);
1542   else if (c_strcasecmp (encoding, "UTF-8") == 0
1543            && handler == SCM_FAILED_CONVERSION_ERROR)
1544     return scm_from_utf8_stringn (str, len);
1545
1546   u32len = 0;
1547   u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1548                                                 (enum iconv_ilseq_handler)
1549                                                 handler,
1550                                                 str, len,
1551                                                 NULL,
1552                                                 NULL, &u32len);
1553
1554   if (SCM_UNLIKELY (u32 == NULL))
1555     decoding_error (__func__, errno, str, len);
1556
1557   i = 0;
1558   while (i < u32len)
1559     if (u32[i++] > 0xFF)
1560       {
1561         wide = 1;
1562         break;
1563       }
1564
1565   if (!wide)
1566     {
1567       char *dst;
1568       res = scm_i_make_string (u32len, &dst, 0);
1569       for (i = 0; i < u32len; i ++)
1570         dst[i] = (unsigned char) u32[i];
1571       dst[u32len] = '\0';
1572     }
1573   else
1574     {
1575       scm_t_wchar *wdst;
1576       res = scm_i_make_wide_string (u32len, &wdst, 0);
1577       u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1578       wdst[u32len] = 0;
1579     }
1580
1581   free (u32);
1582   return res;
1583 }
1584
1585 SCM
1586 scm_from_locale_string (const char *str)
1587 {
1588   return scm_from_locale_stringn (str, -1);
1589 }
1590
1591 SCM
1592 scm_from_locale_stringn (const char *str, size_t len)
1593 {
1594   return scm_from_stringn (str, len, locale_charset (),
1595                            scm_i_default_port_conversion_handler ());
1596 }
1597
1598 SCM
1599 scm_from_latin1_string (const char *str)
1600 {
1601   return scm_from_latin1_stringn (str, -1);
1602 }
1603
1604 SCM
1605 scm_from_latin1_stringn (const char *str, size_t len)
1606 {
1607   char *buf;
1608   SCM result;
1609
1610   if (len == (size_t) -1)
1611     len = strlen (str);
1612
1613   /* Make a narrow string and copy STR as is.  */
1614   result = scm_i_make_string (len, &buf, 0);
1615   memcpy (buf, str, len);
1616
1617   return result;
1618 }
1619
1620 SCM
1621 scm_from_utf8_string (const char *str)
1622 {
1623   return scm_from_utf8_stringn (str, -1);
1624 }
1625
1626 SCM
1627 scm_from_utf8_stringn (const char *str, size_t len)
1628 {
1629   size_t i, char_len;
1630   const scm_t_uint8 *ustr = (const scm_t_uint8 *) str;
1631   int ascii = 1, narrow = 1;
1632   SCM res;
1633
1634   if (len == (size_t) -1)
1635     len = strlen (str);
1636
1637   i = 0;
1638   char_len = 0;
1639
1640   while (i < len)
1641     {
1642       if (ustr[i] <= 127)
1643         {
1644           char_len++;
1645           i++;
1646         }
1647       else
1648         {
1649           ucs4_t c;
1650           int nbytes;
1651
1652           ascii = 0;
1653
1654           nbytes = u8_mbtouc (&c, ustr + i, len - i);
1655
1656           if (c == 0xfffd)
1657             /* Bad UTF-8.  */
1658             decoding_error (__func__, errno, str, len);
1659
1660           if (c > 255)
1661             narrow = 0;
1662
1663           char_len++;
1664           i += nbytes;
1665         }
1666     }
1667
1668   if (ascii)
1669     {
1670       char *dst;
1671       res = scm_i_make_string (char_len, &dst, 0);
1672       memcpy (dst, str, len);
1673     }
1674   else if (narrow)
1675     {
1676       char *dst;
1677       size_t j;
1678       ucs4_t c;
1679
1680       res = scm_i_make_string (char_len, &dst, 0);
1681
1682       for (i = 0, j = 0; i < len; j++)
1683         {
1684           i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1685           dst[j] = (signed char) c;
1686         }
1687     }
1688   else
1689     {
1690       scm_t_wchar *dst;
1691       size_t j;
1692       ucs4_t c;
1693
1694       res = scm_i_make_wide_string (char_len, &dst, 0);
1695
1696       for (i = 0, j = 0; i < len; j++)
1697         {
1698           i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1699           dst[j] = c;
1700         }
1701     }
1702
1703   return res;
1704 }
1705
1706 SCM
1707 scm_from_utf32_string (const scm_t_wchar *str)
1708 {
1709   return scm_from_utf32_stringn (str, -1);
1710 }
1711
1712 SCM
1713 scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1714 {
1715   SCM result;
1716   scm_t_wchar *buf;
1717
1718   if (len == (size_t) -1)
1719     len = u32_strlen ((uint32_t *) str);
1720
1721   result = scm_i_make_wide_string (len, &buf, 0);
1722   memcpy (buf, str, len * sizeof (scm_t_wchar));
1723   scm_i_try_narrow_string (result);
1724
1725   return result;
1726 }
1727
1728 SCM
1729 scm_from_port_string (const char *str, SCM port)
1730 {
1731   return scm_from_port_stringn (str, -1, port);
1732 }
1733
1734 SCM
1735 scm_from_port_stringn (const char *str, size_t len, SCM port)
1736 {
1737   scm_t_port *pt = SCM_PTAB_ENTRY (port);
1738   scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
1739
1740   if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1)
1741     return scm_from_latin1_stringn (str, len);
1742   else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8
1743            && pt->ilseq_handler == SCM_FAILED_CONVERSION_ERROR)
1744     return scm_from_utf8_stringn (str, len);
1745   else
1746     return scm_from_stringn (str, len, pt->encoding, pt->ilseq_handler);
1747 }
1748
1749 /* Create a new scheme string from the C string STR.  The memory of
1750    STR may be used directly as storage for the new string.  */
1751 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1752    would be to register a finalizer to eventually free(3) STR, which isn't
1753    worth it.  Should we just deprecate the `scm_take_' functions?  */
1754 SCM
1755 scm_take_locale_stringn (char *str, size_t len)
1756 {
1757   SCM res;
1758
1759   res = scm_from_locale_stringn (str, len);
1760   free (str);
1761
1762   return res;
1763 }
1764
1765 SCM
1766 scm_take_locale_string (char *str)
1767 {
1768   return scm_take_locale_stringn (str, -1);
1769 }
1770
1771 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1772    *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1773    Set *LENP to the size of the resulting string.
1774
1775    FIXME: This is a hack we should get rid of.  See
1776    <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1777    for details.  */
1778 static void
1779 unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
1780 {
1781   char *before, *after;
1782   size_t i, j;
1783
1784   before = buf;
1785   after = buf;
1786   i = 0;
1787   j = 0;
1788   while (i < *lenp)
1789     {
1790       if ((i <= *lenp - 6)
1791           && before[i] == '\\'
1792           && before[i + 1] == 'u'
1793           && before[i + 2] == '0' && before[i + 3] == '0')
1794         {
1795           /* Convert \u00NN to \xNN */
1796           after[j] = '\\';
1797           after[j + 1] = 'x';
1798           after[j + 2] = tolower ((int) before[i + 4]);
1799           after[j + 3] = tolower ((int) before[i + 5]);
1800           i += 6;
1801           j += 4;
1802         }
1803       else if ((i <= *lenp - 10)
1804                && before[i] == '\\'
1805                && before[i + 1] == 'U'
1806                && before[i + 2] == '0' && before[i + 3] == '0')
1807         {
1808           /* Convert \U00NNNNNN to \UNNNNNN */
1809           after[j] = '\\';
1810           after[j + 1] = 'U';
1811           after[j + 2] = tolower ((int) before[i + 4]);
1812           after[j + 3] = tolower ((int) before[i + 5]);
1813           after[j + 4] = tolower ((int) before[i + 6]);
1814           after[j + 5] = tolower ((int) before[i + 7]);
1815           after[j + 6] = tolower ((int) before[i + 8]);
1816           after[j + 7] = tolower ((int) before[i + 9]);
1817           i += 10;
1818           j += 8;
1819         }
1820       else
1821         {
1822           after[j] = before[i];
1823           i++;
1824           j++;
1825         }
1826     }
1827   *lenp = j;
1828 }
1829
1830 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1831    *LENP-byte locale-encoded string, to `\xXXXX;'.  Set *LEN to the size
1832    of the resulting string.  BUF must be large enough to handle the
1833    worst case when `\uXXXX' escapes (6 characters) are replaced by
1834    `\xXXXX;' (7 characters).  */
1835 static void
1836 unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
1837 {
1838   char *before, *after;
1839   size_t i, j;
1840   /* The worst case is if the input string contains all 4-digit hex escapes.
1841      "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1842   size_t max_out_len = (*lenp * 7) / 6 + 1;
1843   size_t nzeros, ndigits;
1844
1845   before = buf;
1846   after = alloca (max_out_len);
1847   i = 0;
1848   j = 0;
1849   while (i < *lenp)
1850     {
1851       if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1852           || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1853         {
1854           if (before[i + 1] == 'u')
1855             ndigits = 4;
1856           else if (before[i + 1] == 'U')
1857             ndigits = 8;
1858           else
1859             abort ();
1860
1861           /* Add the R6RS hex escape initial sequence.  */
1862           after[j] = '\\';
1863           after[j + 1] = 'x';
1864
1865           /* Move string positions to the start of the hex numbers.  */
1866           i += 2;
1867           j += 2;
1868
1869           /* Find the number of initial zeros in this hex number.  */
1870           nzeros = 0;
1871           while (before[i + nzeros] == '0' && nzeros < ndigits)
1872             nzeros++;
1873
1874           /* Copy the number, skipping initial zeros, and then move the string
1875              positions.  */
1876           if (nzeros == ndigits)
1877             {
1878               after[j] = '0';
1879               i += ndigits;
1880               j += 1;
1881             }
1882           else
1883             {
1884               int pos;
1885               for (pos = 0; pos < ndigits - nzeros; pos++)
1886                 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1887               i += ndigits;
1888               j += (ndigits - nzeros);
1889             }
1890
1891           /* Add terminating semicolon.  */
1892           after[j] = ';';
1893           j++;
1894         }
1895       else
1896         {
1897           after[j] = before[i];
1898           i++;
1899           j++;
1900         }
1901     }
1902   *lenp = j;
1903   memcpy (before, after, j);
1904 }
1905
1906 char *
1907 scm_to_locale_string (SCM str)
1908 {
1909   return scm_to_locale_stringn (str, NULL);
1910 }
1911
1912 char *
1913 scm_to_locale_stringn (SCM str, size_t *lenp)
1914 {
1915   return scm_to_stringn (str, lenp,
1916                          locale_charset (),
1917                          scm_i_default_port_conversion_handler ());
1918 }
1919
1920 char *
1921 scm_to_latin1_string (SCM str)
1922 {
1923   return scm_to_latin1_stringn (str, NULL);
1924 }
1925
1926 char *
1927 scm_to_latin1_stringn (SCM str, size_t *lenp)
1928 #define FUNC_NAME "scm_to_latin1_stringn"
1929 {
1930   char *result;
1931
1932   SCM_VALIDATE_STRING (1, str);
1933
1934   if (scm_i_is_narrow_string (str))
1935     {
1936       size_t len = scm_i_string_length (str);
1937
1938       if (lenp)
1939         *lenp = len;
1940
1941       result = scm_strndup (scm_i_string_data (str), len);
1942     }
1943   else
1944     result = scm_to_stringn (str, lenp, NULL,
1945                              SCM_FAILED_CONVERSION_ERROR);
1946
1947   return result;
1948 }
1949 #undef FUNC_NAME
1950
1951 char *
1952 scm_to_utf8_string (SCM str)
1953 {
1954   return scm_to_utf8_stringn (str, NULL);
1955 }
1956
1957 static size_t
1958 latin1_u8_strlen (const scm_t_uint8 *str, size_t len)
1959 {
1960   size_t ret, i;
1961   for (i = 0, ret = 0; i < len; i++)
1962     ret += (str[i] < 128) ? 1 : 2;
1963   return ret;
1964 }
1965
1966 static scm_t_uint8*
1967 latin1_to_u8 (const scm_t_uint8 *str, size_t latin_len,
1968               scm_t_uint8 *u8_result, size_t *u8_lenp)
1969 {
1970   size_t i, n;
1971   size_t u8_len = latin1_u8_strlen (str, latin_len);
1972
1973   if (!(u8_result && u8_lenp && *u8_lenp > u8_len))
1974     u8_result = scm_malloc (u8_len + 1);
1975   if (u8_lenp)
1976     *u8_lenp = u8_len;
1977
1978   for (i = 0, n = 0; i < latin_len; i++)
1979     n += u8_uctomb (u8_result + n, str[i], u8_len - n);
1980   if (n != u8_len)
1981     abort ();
1982   u8_result[n] = 0;
1983
1984   return u8_result;
1985 }
1986
1987 /* UTF-8 code table
1988
1989    (Note that this includes code points that are not allowed by Unicode,
1990     but since this function has no way to report an error, and its
1991     purpose is to determine the size of destination buffers for
1992     libunicode conversion functions, we err on the safe side and handle
1993     everything that libunicode might conceivably handle, now or in the
1994     future.)
1995
1996    Char. number range  |        UTF-8 octet sequence
1997       (hexadecimal)    |              (binary)
1998    --------------------+------------------------------------------------------
1999    0000 0000-0000 007F | 0xxxxxxx
2000    0000 0080-0000 07FF | 110xxxxx 10xxxxxx
2001    0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
2002    0001 0000-001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
2003    0020 0000-03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2004    0400 0000-7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2005 */
2006
2007 static size_t
2008 u32_u8_length_in_bytes (const scm_t_uint32 *str, size_t len)
2009 {
2010   size_t ret, i;
2011
2012   for (i = 0, ret = 0; i < len; i++)
2013     {
2014       scm_t_uint32 c = str[i];
2015
2016       if (c <= 0x7f)
2017         ret += 1;
2018       else if (c <= 0x7ff)
2019         ret += 2;
2020       else if (c <= 0xffff)
2021         ret += 3;
2022       else if (c <= 0x1fffff)
2023         ret += 4;
2024       else if (c <= 0x3ffffff)
2025         ret += 5;
2026       else
2027         ret += 6;
2028     }
2029
2030   return ret;
2031 }
2032
2033 char *
2034 scm_to_utf8_stringn (SCM str, size_t *lenp)
2035 #define FUNC_NAME "scm_to_utf8_stringn"
2036 {
2037   SCM_VALIDATE_STRING (1, str);
2038
2039   if (scm_i_is_narrow_string (str))
2040     return (char *) latin1_to_u8 ((scm_t_uint8 *) scm_i_string_chars (str),
2041                                   scm_i_string_length (str),
2042                                   NULL, lenp);
2043   else
2044     {
2045       scm_t_uint32 *chars = (scm_t_uint32 *) scm_i_string_wide_chars (str);
2046       scm_t_uint8 *buf, *ret;
2047       size_t num_chars = scm_i_string_length (str);
2048       size_t num_bytes_predicted, num_bytes_actual;
2049
2050       num_bytes_predicted = u32_u8_length_in_bytes (chars, num_chars);
2051
2052       if (lenp)
2053         {
2054           *lenp = num_bytes_predicted;
2055           buf = scm_malloc (num_bytes_predicted);
2056         }
2057       else
2058         {
2059           buf = scm_malloc (num_bytes_predicted + 1);
2060           buf[num_bytes_predicted] = 0;
2061         }
2062
2063       num_bytes_actual = num_bytes_predicted;
2064       ret = u32_to_u8 (chars, num_chars, buf, &num_bytes_actual);
2065
2066       if (SCM_LIKELY (ret == buf && num_bytes_actual == num_bytes_predicted))
2067         return (char *) ret;
2068
2069       /* An error: a bad codepoint.  */
2070       {
2071         int saved_errno = errno;
2072
2073         free (buf);
2074         if (!saved_errno)
2075           abort ();
2076
2077         scm_decoding_error ("scm_to_utf8_stringn", errno,
2078                             "invalid codepoint in string", str);
2079
2080         /* Not reached.  */
2081         return NULL;
2082       }
2083     }
2084 }
2085 #undef FUNC_NAME
2086
2087 scm_t_wchar *
2088 scm_to_utf32_string (SCM str)
2089 {
2090   return scm_to_utf32_stringn (str, NULL);
2091 }
2092
2093 scm_t_wchar *
2094 scm_to_utf32_stringn (SCM str, size_t *lenp)
2095 #define FUNC_NAME "scm_to_utf32_stringn"
2096 {
2097   scm_t_wchar *result;
2098
2099   SCM_VALIDATE_STRING (1, str);
2100
2101   if (scm_i_is_narrow_string (str))
2102     {
2103       scm_t_uint8 *codepoints;
2104       size_t i, len;
2105
2106       codepoints = (scm_t_uint8*) scm_i_string_chars (str);
2107       len = scm_i_string_length (str);
2108       if (lenp)
2109         *lenp = len;
2110
2111       result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2112       for (i = 0; i < len; i++)
2113         result[i] = codepoints[i];
2114       result[len] = 0;
2115     }
2116   else
2117     {
2118       size_t len;
2119
2120       len = scm_i_string_length (str);
2121       if (lenp)
2122         *lenp = len;
2123
2124       result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2125       memcpy (result, scm_i_string_wide_chars (str),
2126               len * sizeof (scm_t_wchar));
2127       result[len] = 0;
2128     }
2129
2130   return result;
2131 }
2132 #undef FUNC_NAME
2133
2134 char *
2135 scm_to_port_string (SCM str, SCM port)
2136 {
2137   return scm_to_port_stringn (str, NULL, port);
2138 }
2139
2140 char *
2141 scm_to_port_stringn (SCM str, size_t *lenp, SCM port)
2142 {
2143   scm_t_port *pt = SCM_PTAB_ENTRY (port);
2144   scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
2145
2146   if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1
2147       && pt->ilseq_handler == SCM_FAILED_CONVERSION_ERROR)
2148     return scm_to_latin1_stringn (str, lenp);
2149   else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8)
2150     return scm_to_utf8_stringn (str, lenp);
2151   else
2152     return scm_to_stringn (str, lenp, pt->encoding, pt->ilseq_handler);
2153 }
2154
2155 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
2156    according to ENCODING.  If LENP is non-NULL, set it to the size in bytes of
2157    the returned buffer.  If the conversion to ENCODING fails, apply the strategy
2158    defined by HANDLER.  */
2159 char *
2160 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
2161                 scm_t_string_failed_conversion_handler handler)
2162 {
2163   char *buf;
2164   size_t ilen, len, i;
2165   int ret;
2166   const char *enc;
2167
2168   if (!scm_is_string (str))
2169     scm_wrong_type_arg_msg (NULL, 0, str, "string");
2170   ilen = scm_i_string_length (str);
2171
2172   if (ilen == 0)
2173     {
2174       buf = scm_malloc (1);
2175       buf[0] = '\0';
2176       if (lenp)
2177         *lenp = 0;
2178       return buf;
2179     }
2180
2181   if (lenp == NULL)
2182     for (i = 0; i < ilen; i++)
2183       if (scm_i_string_ref (str, i) == '\0')
2184         scm_misc_error (NULL,
2185                         "string contains #\\nul character: ~S",
2186                         scm_list_1 (str));
2187
2188   if (scm_i_is_narrow_string (str)
2189       && c_strcasecmp (encoding, "ISO-8859-1") == 0)
2190     {
2191       /* If using native Latin-1 encoding, just copy the string
2192          contents.  */
2193       if (lenp)
2194         {
2195           buf = scm_malloc (ilen);
2196           memcpy (buf, scm_i_string_chars (str), ilen);
2197           *lenp = ilen;
2198           return buf;
2199         }
2200       else
2201         {
2202           buf = scm_malloc (ilen + 1);
2203           memcpy (buf, scm_i_string_chars (str), ilen);
2204           buf[ilen] = '\0';
2205           return buf;
2206         }
2207     }
2208
2209
2210   buf = NULL;
2211   len = 0;
2212   enc = encoding;
2213   if (enc == NULL)
2214     enc = "ISO-8859-1";
2215   if (scm_i_is_narrow_string (str))
2216     {
2217       ret = mem_iconveh (scm_i_string_chars (str), ilen,
2218                          "ISO-8859-1", enc,
2219                          (enum iconv_ilseq_handler) handler, NULL,
2220                          &buf, &len);
2221
2222       if (ret != 0)
2223         scm_encoding_error (__func__, errno,
2224                             "cannot convert narrow string to output locale",
2225                             SCM_BOOL_F,
2226                             /* FIXME: Faulty character unknown.  */
2227                             SCM_BOOL_F);
2228     }
2229   else
2230     {
2231       buf = u32_conv_to_encoding (enc,
2232                                   (enum iconv_ilseq_handler) handler,
2233                                   (scm_t_uint32 *) scm_i_string_wide_chars (str),
2234                                   ilen,
2235                                   NULL,
2236                                   NULL, &len);
2237       if (buf == NULL)
2238         scm_encoding_error (__func__, errno,
2239                             "cannot convert wide string to output locale",
2240                             SCM_BOOL_F,
2241                             /* FIXME: Faulty character unknown.  */
2242                             SCM_BOOL_F);
2243     }
2244   if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
2245     {
2246       if (SCM_R6RS_ESCAPES_P)
2247         {
2248           /* The worst case is if the input string contains all 4-digit
2249              hex escapes.  "\uXXXX" (six characters) becomes "\xXXXX;"
2250              (seven characters).  Make BUF large enough to hold
2251              that.  */
2252           buf = scm_realloc (buf, (len * 7) / 6 + 1);
2253           unistring_escapes_to_r6rs_escapes (buf, &len);
2254         }
2255       else
2256         unistring_escapes_to_guile_escapes (buf, &len);
2257
2258       buf = scm_realloc (buf, len);
2259     }
2260   if (lenp)
2261     *lenp = len;
2262   else
2263     {
2264       buf = scm_realloc (buf, len + 1);
2265       buf[len] = '\0';
2266     }
2267
2268   scm_remember_upto_here_1 (str);
2269   return buf;
2270 }
2271
2272 size_t
2273 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
2274 {
2275   size_t len;
2276   char *result = NULL;
2277   if (!scm_is_string (str))
2278     scm_wrong_type_arg_msg (NULL, 0, str, "string");
2279   result = scm_to_locale_stringn (str, &len);
2280
2281   memcpy (buf, result, (len > max_len) ? max_len : len);
2282   free (result);
2283
2284   scm_remember_upto_here_1 (str);
2285   return len;
2286 }
2287
2288 \f
2289 /* Unicode string normalization.  */
2290
2291 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2292    libguile/i18n.c.  It would be useful to have this factored out into a more
2293    convenient location, but its use of alloca makes that tricky to do. */
2294
2295 static SCM
2296 normalize_str (SCM string, uninorm_t form)
2297 {
2298   SCM ret;
2299   scm_t_uint32 *w_str;
2300   scm_t_wchar *cbuf;
2301   size_t rlen, len = scm_i_string_length (string);
2302
2303   if (scm_i_is_narrow_string (string))
2304     {
2305       size_t i;
2306       const char *buf = scm_i_string_chars (string);
2307
2308       w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2309
2310       for (i = 0; i < len; i ++)
2311         w_str[i] = (unsigned char) buf[i];
2312       w_str[len] = 0;
2313     }
2314   else
2315     w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2316
2317   w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2318
2319   ret = scm_i_make_wide_string (rlen, &cbuf, 0);
2320   u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2321   free (w_str);
2322
2323   scm_i_try_narrow_string (ret);
2324
2325   return ret;
2326 }
2327
2328 SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2329             (SCM string),
2330             "Returns the NFC normalized form of @var{string}.")
2331 #define FUNC_NAME s_scm_string_normalize_nfc
2332 {
2333   SCM_VALIDATE_STRING (1, string);
2334   return normalize_str (string, UNINORM_NFC);
2335 }
2336 #undef FUNC_NAME
2337
2338 SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2339             (SCM string),
2340             "Returns the NFD normalized form of @var{string}.")
2341 #define FUNC_NAME s_scm_string_normalize_nfd
2342 {
2343   SCM_VALIDATE_STRING (1, string);
2344   return normalize_str (string, UNINORM_NFD);
2345 }
2346 #undef FUNC_NAME
2347
2348 SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2349             (SCM string),
2350             "Returns the NFKC normalized form of @var{string}.")
2351 #define FUNC_NAME s_scm_string_normalize_nfkc
2352 {
2353   SCM_VALIDATE_STRING (1, string);
2354   return normalize_str (string, UNINORM_NFKC);
2355 }
2356 #undef FUNC_NAME
2357
2358 SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2359             (SCM string),
2360             "Returns the NFKD normalized form of @var{string}.")
2361 #define FUNC_NAME s_scm_string_normalize_nfkd
2362 {
2363   SCM_VALIDATE_STRING (1, string);
2364   return normalize_str (string, UNINORM_NFKD);
2365 }
2366 #undef FUNC_NAME
2367
2368 /* converts C scm_array of strings to SCM scm_list of strings.
2369    If argc < 0, a null terminated scm_array is assumed.
2370    The current locale encoding is assumed */
2371 SCM
2372 scm_makfromstrs (int argc, char **argv)
2373 {
2374   int i = argc;
2375   SCM lst = SCM_EOL;
2376   if (0 > i)
2377     for (i = 0; argv[i]; i++);
2378   while (i--)
2379     lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2380   return lst;
2381 }
2382
2383 /* Return a newly allocated array of char pointers to each of the strings
2384    in args, with a terminating NULL pointer.  The strings are encoded using
2385    the current locale. */
2386
2387 char **
2388 scm_i_allocate_string_pointers (SCM list)
2389 #define FUNC_NAME "scm_i_allocate_string_pointers"
2390 {
2391   char **result;
2392   int list_len = scm_ilength (list);
2393   int i;
2394
2395   if (list_len < 0)
2396     scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2397
2398   result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2399                           "string pointers");
2400   result[list_len] = NULL;
2401
2402   /* The list might have been modified in another thread, so
2403      we check LIST before each access.
2404    */
2405   for (i = 0; i < list_len && scm_is_pair (list); i++)
2406     {
2407       SCM str = SCM_CAR (list);
2408       size_t len;  /* String length in bytes */
2409       char *c_str = scm_to_locale_stringn (str, &len);
2410
2411       /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2412          scm_malloc to allocate the returned string, which must be
2413          explicitly deallocated.  This forces us to copy the string a
2414          second time into a new buffer.  Ideally there would be variants
2415          of scm_to_*_stringn that can return garbage-collected buffers. */
2416
2417       result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2418       memcpy (result[i], c_str, len);
2419       result[i][len] = '\0';
2420       free (c_str);
2421
2422       list = SCM_CDR (list);
2423     }
2424
2425   return result;
2426 }
2427 #undef FUNC_NAME
2428
2429 void
2430 scm_i_get_substring_spec (size_t len,
2431                           SCM start, size_t *cstart,
2432                           SCM end, size_t *cend)
2433 {
2434   if (SCM_UNBNDP (start))
2435     *cstart = 0;
2436   else
2437     *cstart = scm_to_unsigned_integer (start, 0, len);
2438
2439   if (SCM_UNBNDP (end))
2440     *cend = len;
2441   else
2442     *cend = scm_to_unsigned_integer (end, *cstart, len);
2443 }
2444
2445 static SCM
2446 string_handle_ref (scm_t_array_handle *h, size_t index)
2447 {
2448   return scm_c_string_ref (h->array, index);
2449 }
2450
2451 static void
2452 string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2453 {
2454   scm_c_string_set_x (h->array, index, val);
2455 }
2456
2457 static void
2458 string_get_handle (SCM v, scm_t_array_handle *h)
2459 {
2460   h->array = v;
2461   h->ndims = 1;
2462   h->dims = &h->dim0;
2463   h->dim0.lbnd = 0;
2464   h->dim0.ubnd = scm_c_string_length (v) - 1;
2465   h->dim0.inc = 1;
2466   h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2467   h->elements = h->writable_elements = NULL;
2468 }
2469
2470 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2471                           string_handle_ref, string_handle_set,
2472                           string_get_handle)
2473 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2474
2475 void
2476 scm_init_strings ()
2477 {
2478   scm_nullstr = scm_i_make_string (0, NULL, 0);
2479
2480 #include "libguile/strings.x"
2481 }
2482
2483
2484 /*
2485   Local Variables:
2486   c-file-style: "gnu"
2487   End:
2488 */