libguile/strings.c

   1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   2  *
   3  * This library is free software; you can redistribute it and/or
   4  * modify it under the terms of the GNU Lesser General Public License
   5  * as published by the Free Software Foundation; either version 3 of
   6  * the License, or (at your option) any later version.
   7  *
   8  * This library is distributed in the hope that it will be useful, but
   9  * WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * Lesser General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU Lesser General Public
  14  * License along with this library; if not, write to the Free Software
  15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  16  * 02110-1301 USA
  17  */
  18
  19
  20 \f
  21 #ifdef HAVE_CONFIG_H
  22 # include <config.h>
  23 #endif
  24
  25 #include <alloca.h>
  26 #include <string.h>
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <uninorm.h>
  30 #include <unistr.h>
  31 #include <uniconv.h>
  32
  33 #include "striconveh.h"
  34
  35 #include "libguile/_scm.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/root.h"
  38 #include "libguile/strings.h"
  39 #include "libguile/error.h"
  40 #include "libguile/generalized-vectors.h"
  41 #include "libguile/deprecation.h"
  42 #include "libguile/validate.h"
  43 #include "libguile/private-options.h"
  44
  45 \f
  46
  47 /* {Strings}
  48  */
  49
  50
  51 /* Stringbufs
  52  *
  53  * XXX - keeping an accurate refcount during GC seems to be quite
  54  * tricky, so we just keep score of whether a stringbuf might be
  55  * shared, not whether it definitely is.
  56  *
  57  * The scheme I (mvo) tried to keep an accurate reference count would
  58  * recount all strings that point to a stringbuf during the mark-phase
  59  * of the GC.  This was done since one cannot access the stringbuf of
  60  * a string when that string is freed (in order to decrease the
  61  * reference count).  The memory of the stringbuf might have been
  62  * reused already for something completely different.
  63  *
  64  * This recounted worked for a small number of threads beating on
  65  * cow-strings, but it failed randomly with more than 10 threads, say.
  66  * I couldn't figure out what went wrong, so I used the conservative
  67  * approach implemented below.
  68  *
  69  * There are 2 storage strategies for stringbufs: 8-bit and wide.  8-bit
  70  * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
  71  * strings.
  72  */
  73
  74 /* The size in words of the stringbuf header (type tag + size).  */
  75 #define STRINGBUF_HEADER_SIZE   2U
  76
  77 #define STRINGBUF_HEADER_BYTES  (STRINGBUF_HEADER_SIZE * sizeof (SCM))
  78
  79 #define STRINGBUF_F_SHARED      SCM_I_STRINGBUF_F_SHARED
  80 #define STRINGBUF_F_WIDE        SCM_I_STRINGBUF_F_WIDE
  81
  82 #define STRINGBUF_TAG           scm_tc7_stringbuf
  83 #define STRINGBUF_SHARED(buf)   (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
  84 #define STRINGBUF_WIDE(buf)     (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
  85
  86 #define STRINGBUF_CONTENTS(buf) ((void *)                               \
  87                                  SCM_CELL_OBJECT_LOC (buf,              \
  88                                                       STRINGBUF_HEADER_SIZE))
  89 #define STRINGBUF_CHARS(buf)    ((unsigned char *) STRINGBUF_CONTENTS (buf))
  90 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
  91
  92 #define STRINGBUF_LENGTH(buf)   (SCM_CELL_WORD_1 (buf))
  93
  94 #define SET_STRINGBUF_SHARED(buf)                                       \
  95   do                                                                    \
  96     {                                                                   \
  97       /* Don't modify BUF if it's already marked as shared since it might be \
  98          a read-only, statically allocated stringbuf.  */               \
  99       if (SCM_LIKELY (!STRINGBUF_SHARED (buf)))                         \
 100         SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
 101     }                                                                   \
 102   while (0)
 103
 104 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 105 static size_t lenhist[1001];
 106 #endif
 107
 108 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
 109    characters. */
 110 static SCM
 111 make_stringbuf (size_t len)
 112 {
 113   /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
 114      scm_i_symbol_chars, all stringbufs are null-terminated.  Once
 115      SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
 116      has been changed for scm_i_symbol_chars, this null-termination
 117      can be dropped.
 118   */
 119
 120   SCM buf;
 121
 122 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 123   if (len < 1000)
 124     lenhist[len]++;
 125   else
 126     lenhist[1000]++;
 127 #endif
 128
 129   buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
 130                                             "string"));
 131
 132   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
 133   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 134
 135   STRINGBUF_CHARS (buf)[len] = 0;
 136
 137   return buf;
 138 }
 139
 140 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
 141    characters.  */
 142 static SCM
 143 make_wide_stringbuf (size_t len)
 144 {
 145   SCM buf;
 146   size_t raw_len;
 147
 148 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 149   if (len < 1000)
 150     lenhist[len]++;
 151   else
 152     lenhist[1000]++;
 153 #endif
 154
 155   raw_len = (len + 1) * sizeof (scm_t_wchar);
 156   buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
 157                                             "string"));
 158
 159   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
 160   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 161
 162   STRINGBUF_WIDE_CHARS (buf)[len] = 0;
 163
 164   return buf;
 165 }
 166
 167 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
 168    characters from BUF.  */
 169 static SCM
 170 wide_stringbuf (SCM buf)
 171 {
 172   SCM new_buf;
 173
 174   if (STRINGBUF_WIDE (buf))
 175     new_buf = buf;
 176   else
 177     {
 178       size_t i, len;
 179       scm_t_wchar *mem;
 180
 181       len = STRINGBUF_LENGTH (buf);
 182
 183       new_buf = make_wide_stringbuf (len);
 184
 185       mem = STRINGBUF_WIDE_CHARS (new_buf);
 186       for (i = 0; i < len; i++)
 187         mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
 188       mem[len] = 0;
 189     }
 190
 191   return new_buf;
 192 }
 193
 194 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
 195    characters from BUF, if possible.  */
 196 static SCM
 197 narrow_stringbuf (SCM buf)
 198 {
 199   SCM new_buf;
 200
 201   if (!STRINGBUF_WIDE (buf))
 202     new_buf = buf;
 203   else
 204     {
 205       size_t i, len;
 206       scm_t_wchar *wmem;
 207       unsigned char *mem;
 208
 209       len = STRINGBUF_LENGTH (buf);
 210       wmem = STRINGBUF_WIDE_CHARS (buf);
 211
 212       for (i = 0; i < len; i++)
 213         if (wmem[i] > 0xFF)
 214           /* BUF cannot be narrowed.  */
 215           return buf;
 216
 217       new_buf = make_stringbuf (len);
 218
 219       mem = STRINGBUF_CHARS (new_buf);
 220       for (i = 0; i < len; i++)
 221         mem[i] = (unsigned char) wmem[i];
 222       mem[len] = 0;
 223     }
 224
 225   return new_buf;
 226 }
 227
 228 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
 229 SCM_PTHREAD_ATFORK_LOCK_STATIC_MUTEX (stringbuf_write_mutex);
 230
 231 \f
 232 /* Copy-on-write strings.
 233  */
 234
 235 #define STRING_TAG            scm_tc7_string
 236
 237 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
 238 #define STRING_START(str)     ((size_t)SCM_CELL_WORD_2(str))
 239 #define STRING_LENGTH(str)    ((size_t)SCM_CELL_WORD_3(str))
 240
 241 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
 242 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
 243
 244 #define IS_STRING(str)        (SCM_HAS_TYP7 (str, STRING_TAG))
 245
 246 /* Read-only strings.
 247  */
 248
 249 #define RO_STRING_TAG         scm_tc7_ro_string
 250 #define IS_RO_STRING(str)     (SCM_CELL_TYPE(str)==RO_STRING_TAG)
 251
 252 /* Mutation-sharing substrings
 253  */
 254
 255 #define SH_STRING_TAG       (scm_tc7_string + 0x100)
 256
 257 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
 258 /* START and LENGTH as for STRINGs. */
 259
 260 #define IS_SH_STRING(str)   (SCM_CELL_TYPE(str)==SH_STRING_TAG)
 261
 262 SCM scm_nullstr;
 263
 264 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
 265    characters.  CHARSP, if not NULL, will be set to location of the
 266    char array.  If READ_ONLY_P, the returned string is read-only;
 267    otherwise it is writable.  */
 268 SCM
 269 scm_i_make_string (size_t len, char **charsp, int read_only_p)
 270 {
 271   static SCM null_stringbuf = SCM_BOOL_F;
 272   SCM buf;
 273   SCM res;
 274
 275   if (len == 0)
 276     {
 277       if (SCM_UNLIKELY (scm_is_false (null_stringbuf)))
 278         {
 279           null_stringbuf = make_stringbuf (0);
 280           SET_STRINGBUF_SHARED (null_stringbuf);
 281         }
 282       buf = null_stringbuf;
 283     }
 284   else
 285     buf = make_stringbuf (len);
 286
 287   if (charsp)
 288     *charsp = (char *) STRINGBUF_CHARS (buf);
 289   res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
 290                          SCM_UNPACK (buf),
 291                          (scm_t_bits) 0, (scm_t_bits) len);
 292   return res;
 293 }
 294
 295 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
 296    characters.  CHARSP, if not NULL, will be set to location of the
 297    character array.  If READ_ONLY_P, the returned string is read-only;
 298    otherwise it is writable.  */
 299 SCM
 300 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
 301 {
 302   SCM buf = make_wide_stringbuf (len);
 303   SCM res;
 304   if (charsp)
 305     *charsp = STRINGBUF_WIDE_CHARS (buf);
 306   res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
 307                          SCM_UNPACK (buf),
 308                          (scm_t_bits) 0, (scm_t_bits) len);
 309   return res;
 310 }
 311
 312 static void
 313 validate_substring_args (SCM str, size_t start, size_t end)
 314 {
 315   if (!IS_STRING (str))
 316     scm_wrong_type_arg_msg (NULL, 0, str, "string");
 317   if (start > STRING_LENGTH (str))
 318     scm_out_of_range (NULL, scm_from_size_t (start));
 319   if (end > STRING_LENGTH (str) || end < start)
 320     scm_out_of_range (NULL, scm_from_size_t (end));
 321 }
 322
 323 static inline void
 324 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
 325 {
 326   *start = STRING_START (*str);
 327   if (IS_SH_STRING (*str))
 328     {
 329       *str = SH_STRING_STRING (*str);
 330       *start += STRING_START (*str);
 331     }
 332   *buf = STRING_STRINGBUF (*str);
 333 }
 334
 335 SCM
 336 scm_i_substring (SCM str, size_t start, size_t end)
 337 {
 338   if (start == end)
 339     return scm_i_make_string (0, NULL, 0);
 340   else
 341     {
 342       SCM buf;
 343       size_t str_start;
 344       get_str_buf_start (&str, &buf, &str_start);
 345       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 346       SET_STRINGBUF_SHARED (buf);
 347       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 348       return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
 349                               (scm_t_bits)str_start + start,
 350                               (scm_t_bits) end - start);
 351     }
 352 }
 353
 354 SCM
 355 scm_i_substring_read_only (SCM str, size_t start, size_t end)
 356 {
 357   if (start == end)
 358     return scm_i_make_string (0, NULL, 1);
 359   else
 360     {
 361       SCM buf;
 362       size_t str_start;
 363       get_str_buf_start (&str, &buf, &str_start);
 364       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 365       SET_STRINGBUF_SHARED (buf);
 366       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 367       return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
 368                               (scm_t_bits)str_start + start,
 369                               (scm_t_bits) end - start);
 370     }
 371 }
 372
 373 SCM
 374 scm_i_substring_copy (SCM str, size_t start, size_t end)
 375 {
 376   if (start == end)
 377     return scm_i_make_string (0, NULL, 0);
 378   else
 379     {
 380       size_t len = end - start;
 381       SCM buf, my_buf, substr;
 382       size_t str_start;
 383       int wide = 0;
 384       get_str_buf_start (&str, &buf, &str_start);
 385       if (scm_i_is_narrow_string (str))
 386         {
 387           my_buf = make_stringbuf (len);
 388           memcpy (STRINGBUF_CHARS (my_buf),
 389                   STRINGBUF_CHARS (buf) + str_start + start, len);
 390         }
 391       else
 392         {
 393           my_buf = make_wide_stringbuf (len);
 394           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
 395                    (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
 396                                      + start), len);
 397           wide = 1;
 398         }
 399       scm_remember_upto_here_1 (buf);
 400       substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
 401                                 (scm_t_bits) 0, (scm_t_bits) len);
 402       if (wide)
 403         scm_i_try_narrow_string (substr);
 404       return substr;
 405     }
 406 }
 407
 408 SCM
 409 scm_i_substring_shared (SCM str, size_t start, size_t end)
 410 {
 411   if (start == 0 && end == STRING_LENGTH (str))
 412     return str;
 413   else if (start == end)
 414     return scm_i_make_string (0, NULL, 0);
 415   else
 416     {
 417       size_t len = end - start;
 418       if (IS_SH_STRING (str))
 419         {
 420           start += STRING_START (str);
 421           str = SH_STRING_STRING (str);
 422         }
 423       return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
 424                               (scm_t_bits)start, (scm_t_bits) len);
 425     }
 426 }
 427
 428 SCM
 429 scm_c_substring (SCM str, size_t start, size_t end)
 430 {
 431   validate_substring_args (str, start, end);
 432   return scm_i_substring (str, start, end);
 433 }
 434
 435 SCM
 436 scm_c_substring_read_only (SCM str, size_t start, size_t end)
 437 {
 438   validate_substring_args (str, start, end);
 439   return scm_i_substring_read_only (str, start, end);
 440 }
 441
 442 SCM
 443 scm_c_substring_copy (SCM str, size_t start, size_t end)
 444 {
 445   validate_substring_args (str, start, end);
 446   return scm_i_substring_copy (str, start, end);
 447 }
 448
 449 SCM
 450 scm_c_substring_shared (SCM str, size_t start, size_t end)
 451 {
 452   validate_substring_args (str, start, end);
 453   return scm_i_substring_shared (str, start, end);
 454 }
 455
 456 \f
 457 /* Internal accessors
 458  */
 459
 460 /* Returns the number of characters in STR.  This may be different
 461    than the memory size of the string storage.  */
 462 size_t
 463 scm_i_string_length (SCM str)
 464 {
 465   return STRING_LENGTH (str);
 466 }
 467
 468 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
 469    encoding.  False if it is 'wide', having a 32-bit UCS-4
 470    encoding.  */
 471 int
 472 scm_i_is_narrow_string (SCM str)
 473 {
 474   if (IS_SH_STRING (str))
 475     str = SH_STRING_STRING (str);
 476
 477   return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
 478 }
 479
 480 /* Try to coerce a string to be narrow.  It if is narrow already, do
 481    nothing.  If it is wide, shrink it to narrow if none of its
 482    characters are above 0xFF.  Return true if the string is narrow or
 483    was made to be narrow.  */
 484 int
 485 scm_i_try_narrow_string (SCM str)
 486 {
 487   if (IS_SH_STRING (str))
 488     str = SH_STRING_STRING (str);
 489
 490   SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
 491
 492   return scm_i_is_narrow_string (str);
 493 }
 494
 495 /* Return a pointer to the raw data of the string, which can be either Latin-1
 496    or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'.  */
 497 const void *
 498 scm_i_string_data (SCM str)
 499 {
 500   SCM buf;
 501   size_t start;
 502   const char *data;
 503
 504   get_str_buf_start (&str, &buf, &start);
 505
 506   data = STRINGBUF_CONTENTS (buf);
 507   data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
 508
 509   return data;
 510 }
 511
 512 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
 513    STR.  */
 514 const char *
 515 scm_i_string_chars (SCM str)
 516 {
 517   SCM buf;
 518   size_t start;
 519   get_str_buf_start (&str, &buf, &start);
 520   if (scm_i_is_narrow_string (str))
 521     return (const char *) STRINGBUF_CHARS (buf) + start;
 522   else
 523     scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
 524                     scm_list_1 (str));
 525   return NULL;
 526 }
 527
 528 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
 529    STR.  */
 530 const scm_t_wchar *
 531 scm_i_string_wide_chars (SCM str)
 532 {
 533   SCM buf;
 534   size_t start;
 535
 536   get_str_buf_start (&str, &buf, &start);
 537   if (!scm_i_is_narrow_string (str))
 538     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
 539   else
 540     scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
 541                     scm_list_1 (str));
 542 }
 543
 544 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
 545    a new string buffer, so that it can be modified without modifying
 546    other strings.  Also, lock the string mutex.  Later, one must call
 547    scm_i_string_stop_writing to unlock the mutex.  */
 548 SCM
 549 scm_i_string_start_writing (SCM orig_str)
 550 {
 551   SCM buf, str = orig_str;
 552   size_t start;
 553
 554   get_str_buf_start (&str, &buf, &start);
 555   if (IS_RO_STRING (str))
 556     scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
 557
 558   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 559   if (STRINGBUF_SHARED (buf))
 560     {
 561       /* Clone the stringbuf.  */
 562       size_t len = STRING_LENGTH (str);
 563       SCM new_buf;
 564
 565       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 566
 567       if (scm_i_is_narrow_string (str))
 568         {
 569           new_buf = make_stringbuf (len);
 570           memcpy (STRINGBUF_CHARS (new_buf),
 571                   STRINGBUF_CHARS (buf) + STRING_START (str), len);
 572
 573         }
 574       else
 575         {
 576           new_buf = make_wide_stringbuf (len);
 577           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 578                    (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
 579                                      + STRING_START (str)), len);
 580         }
 581
 582       SET_STRING_STRINGBUF (str, new_buf);
 583       start -= STRING_START (str);
 584
 585       /* FIXME: The following operations are not atomic, so other threads
 586          looking at STR may see an inconsistent state.  Nevertheless it can't
 587          hurt much since (i) accessing STR while it is being mutated can't
 588          yield a crash, and (ii) concurrent accesses to STR should be
 589          protected by a mutex at the application level.  The latter may not
 590          apply when STR != ORIG_STR, though.  */
 591       SET_STRING_START (str, 0);
 592       SET_STRING_STRINGBUF (str, new_buf);
 593
 594       buf = new_buf;
 595
 596       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 597     }
 598   return orig_str;
 599 }
 600
 601 /* Return a pointer to the 8-bit Latin-1 chars of a string.  */
 602 char *
 603 scm_i_string_writable_chars (SCM str)
 604 {
 605   SCM buf;
 606   size_t start;
 607
 608   get_str_buf_start (&str, &buf, &start);
 609   if (scm_i_is_narrow_string (str))
 610     return (char *) STRINGBUF_CHARS (buf) + start;
 611   else
 612     scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
 613                     scm_list_1 (str));
 614   return NULL;
 615 }
 616
 617 /* Return a pointer to the UCS-4 codepoints of a string.  */
 618 static scm_t_wchar *
 619 scm_i_string_writable_wide_chars (SCM str)
 620 {
 621   SCM buf;
 622   size_t start;
 623
 624   get_str_buf_start (&str, &buf, &start);
 625   if (!scm_i_is_narrow_string (str))
 626     return STRINGBUF_WIDE_CHARS (buf) + start;
 627   else
 628     scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
 629                     scm_list_1 (str));
 630 }
 631
 632 /* Unlock the string mutex that was locked when
 633    scm_i_string_start_writing was called.  */
 634 void
 635 scm_i_string_stop_writing (void)
 636 {
 637   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 638 }
 639
 640 /* Return the Xth character of STR as a UCS-4 codepoint.  */
 641 scm_t_wchar
 642 scm_i_string_ref (SCM str, size_t x)
 643 {
 644   if (scm_i_is_narrow_string (str))
 645     return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
 646   else
 647     return scm_i_string_wide_chars (str)[x];
 648 }
 649
 650 /* Returns index+1 of the first char in STR that matches C, or
 651    0 if the char is not found.  */
 652 int
 653 scm_i_string_contains_char (SCM str, char ch)
 654 {
 655   size_t i;
 656   size_t len = scm_i_string_length (str);
 657
 658   i = 0;
 659   if (scm_i_is_narrow_string (str))
 660     {
 661       while (i < len)
 662         {
 663           if (scm_i_string_chars (str)[i] == ch)
 664             return i+1;
 665           i++;
 666         }
 667     }
 668   else
 669     {
 670       while (i < len)
 671         {
 672           if (scm_i_string_wide_chars (str)[i]
 673               == (unsigned char) ch)
 674             return i+1;
 675           i++;
 676         }
 677     }
 678   return 0;
 679 }
 680
 681 int
 682 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
 683 {
 684   if (scm_i_is_narrow_string (sstr))
 685     {
 686       const char *a = scm_i_string_chars (sstr) + start_x;
 687       const char *b = cstr;
 688       return strncmp (a, b, strlen(b));
 689     }
 690   else
 691     {
 692       size_t i;
 693       const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
 694       const char *b = cstr;
 695       for (i = 0; i < strlen (b); i++)
 696         {
 697           if (a[i] != (unsigned char) b[i])
 698             return 1;
 699         }
 700     }
 701   return 0;
 702 }
 703
 704 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
 705 void
 706 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
 707 {
 708   if (IS_SH_STRING (str))
 709     {
 710       p += STRING_START (str);
 711       str = SH_STRING_STRING (str);
 712     }
 713
 714   if (chr > 0xFF && scm_i_is_narrow_string (str))
 715     SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
 716
 717   if (scm_i_is_narrow_string (str))
 718     {
 719       char *dst = scm_i_string_writable_chars (str);
 720       dst[p] = chr;
 721     }
 722   else
 723     {
 724       scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
 725       dst[p] = chr;
 726     }
 727 }
 728
 729 \f
 730 /* Symbols.
 731
 732    Basic symbol creation and accessing is done here, the rest is in
 733    symbols.[hc].  This has been done to keep stringbufs and the
 734    internals of strings and string-like objects confined to this file.
 735 */
 736
 737 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
 738
 739 SCM
 740 scm_i_make_symbol (SCM name, scm_t_bits flags,
 741                    unsigned long hash, SCM props)
 742 {
 743   SCM buf;
 744   size_t start = STRING_START (name);
 745   size_t length = STRING_LENGTH (name);
 746
 747   if (IS_SH_STRING (name))
 748     {
 749       name = SH_STRING_STRING (name);
 750       start += STRING_START (name);
 751     }
 752   buf = SYMBOL_STRINGBUF (name);
 753
 754   if (start == 0 && length == STRINGBUF_LENGTH (buf))
 755     {
 756       /* reuse buf. */
 757       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 758       SET_STRINGBUF_SHARED (buf);
 759       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 760     }
 761   else
 762     {
 763       /* make new buf. */
 764       if (scm_i_is_narrow_string (name))
 765         {
 766           SCM new_buf = make_stringbuf (length);
 767           memcpy (STRINGBUF_CHARS (new_buf),
 768                   STRINGBUF_CHARS (buf) + start, length);
 769           buf = new_buf;
 770         }
 771       else
 772         {
 773           SCM new_buf = make_wide_stringbuf (length);
 774           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 775                    (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
 776                    length);
 777           buf = new_buf;
 778         }
 779     }
 780   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 781                           (scm_t_bits) hash, SCM_UNPACK (props));
 782 }
 783
 784 SCM
 785 scm_i_c_make_symbol (const char *name, size_t len,
 786                      scm_t_bits flags, unsigned long hash, SCM props)
 787 {
 788   SCM buf = make_stringbuf (len);
 789   memcpy (STRINGBUF_CHARS (buf), name, len);
 790
 791   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 792                           (scm_t_bits) hash, SCM_UNPACK (props));
 793 }
 794
 795 /* Returns the number of characters in SYM.  This may be different
 796    from the memory size of SYM.  */
 797 size_t
 798 scm_i_symbol_length (SCM sym)
 799 {
 800   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 801 }
 802
 803 size_t
 804 scm_c_symbol_length (SCM sym)
 805 #define FUNC_NAME "scm_c_symbol_length"
 806 {
 807   SCM_VALIDATE_SYMBOL (1, sym);
 808
 809   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 810 }
 811 #undef FUNC_NAME
 812
 813 /* True if the name of SYM is stored as a Latin-1 encoded string.
 814    False if it is stored as a 32-bit UCS-4-encoded string.  */
 815 int
 816 scm_i_is_narrow_symbol (SCM sym)
 817 {
 818   SCM buf;
 819
 820   buf = SYMBOL_STRINGBUF (sym);
 821   return !STRINGBUF_WIDE (buf);
 822 }
 823
 824 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
 825    contains the name of SYM.  */
 826 const char *
 827 scm_i_symbol_chars (SCM sym)
 828 {
 829   SCM buf;
 830
 831   buf = SYMBOL_STRINGBUF (sym);
 832   if (!STRINGBUF_WIDE (buf))
 833     return (const char *) STRINGBUF_CHARS (buf);
 834   else
 835     scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
 836                     scm_list_1 (sym));
 837 }
 838
 839 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
 840    symbol's name.  */
 841 const scm_t_wchar *
 842 scm_i_symbol_wide_chars (SCM sym)
 843 {
 844   SCM buf;
 845
 846   buf = SYMBOL_STRINGBUF (sym);
 847   if (STRINGBUF_WIDE (buf))
 848     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
 849   else
 850     scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
 851                     scm_list_1 (sym));
 852 }
 853
 854 SCM
 855 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
 856 {
 857   SCM buf = SYMBOL_STRINGBUF (sym);
 858   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 859   SET_STRINGBUF_SHARED (buf);
 860   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 861   return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
 862                           (scm_t_bits)start, (scm_t_bits) end - start);
 863 }
 864
 865 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint.  */
 866 scm_t_wchar
 867 scm_i_symbol_ref (SCM sym, size_t x)
 868 {
 869   if (scm_i_is_narrow_symbol (sym))
 870     return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
 871   else
 872     return scm_i_symbol_wide_chars (sym)[x];
 873 }
 874
 875 /* Debugging
 876  */
 877
 878 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
 879             "Returns an association list containing debugging information\n"
 880             "for @var{str}. The association list has the following entries."
 881             "@table @code\n"
 882             "@item string\n"
 883             "The string itself.\n"
 884             "@item start\n"
 885             "The start index of the string into its stringbuf\n"
 886             "@item length\n"
 887             "The length of the string\n"
 888             "@item shared\n"
 889             "If this string is a substring, it returns its parent string.\n"
 890             "Otherwise, it returns @code{#f}\n"
 891             "@item read-only\n"
 892             "@code{#t} if the string is read-only\n"
 893             "@item stringbuf-chars\n"
 894             "A new string containing this string's stringbuf's characters\n"
 895             "@item stringbuf-length\n"
 896             "The number of characters in this stringbuf\n"
 897             "@item stringbuf-shared\n"
 898             "@code{#t} if this stringbuf is shared\n"
 899             "@item stringbuf-wide\n"
 900             "@code{#t} if this stringbuf's characters are stored in a\n"
 901             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
 902             "buffer\n"
 903             "@end table")
 904 #define FUNC_NAME s_scm_sys_string_dump
 905 {
 906   SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
 907   SCM buf;
 908   SCM_VALIDATE_STRING (1, str);
 909
 910   /* String info */
 911   e1 = scm_cons (scm_from_latin1_symbol ("string"),
 912                  str);
 913   e2 = scm_cons (scm_from_latin1_symbol ("start"),
 914                  scm_from_size_t (STRING_START (str)));
 915   e3 = scm_cons (scm_from_latin1_symbol ("length"),
 916                  scm_from_size_t (STRING_LENGTH (str)));
 917
 918   if (IS_SH_STRING (str))
 919     {
 920       e4 = scm_cons (scm_from_latin1_symbol ("shared"),
 921                      SH_STRING_STRING (str));
 922       buf = STRING_STRINGBUF (SH_STRING_STRING (str));
 923     }
 924   else
 925     {
 926       e4 = scm_cons (scm_from_latin1_symbol ("shared"),
 927                      SCM_BOOL_F);
 928       buf = STRING_STRINGBUF (str);
 929     }
 930
 931   if (IS_RO_STRING (str))
 932     e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
 933                    SCM_BOOL_T);
 934   else
 935     e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
 936                    SCM_BOOL_F);
 937
 938   /* Stringbuf info */
 939   if (!STRINGBUF_WIDE (buf))
 940     {
 941       size_t len = STRINGBUF_LENGTH (buf);
 942       char *cbuf;
 943       SCM sbc = scm_i_make_string (len, &cbuf, 0);
 944       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
 945       e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 946                      sbc);
 947     }
 948   else
 949     {
 950       size_t len = STRINGBUF_LENGTH (buf);
 951       scm_t_wchar *cbuf;
 952       SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
 953       u32_cpy ((scm_t_uint32 *) cbuf,
 954                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
 955       e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 956                      sbc);
 957     }
 958   e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
 959                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
 960   if (STRINGBUF_SHARED (buf))
 961     e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 962                    SCM_BOOL_T);
 963   else
 964     e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 965                    SCM_BOOL_F);
 966   if (STRINGBUF_WIDE (buf))
 967     e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 968                    SCM_BOOL_T);
 969   else
 970     e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 971                    SCM_BOOL_F);
 972
 973   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
 974 }
 975 #undef FUNC_NAME
 976
 977 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
 978             "Returns an association list containing debugging information\n"
 979             "for @var{sym}. The association list has the following entries."
 980             "@table @code\n"
 981             "@item symbol\n"
 982             "The symbol itself\n"
 983             "@item hash\n"
 984             "Its hash value\n"
 985             "@item interned\n"
 986             "@code{#t} if it is an interned symbol\n"
 987             "@item stringbuf-chars\n"
 988             "A new string containing this symbols's stringbuf's characters\n"
 989             "@item stringbuf-length\n"
 990             "The number of characters in this stringbuf\n"
 991             "@item stringbuf-shared\n"
 992             "@code{#t} if this stringbuf is shared\n"
 993             "@item stringbuf-wide\n"
 994             "@code{#t} if this stringbuf's characters are stored in a\n"
 995             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
 996             "buffer\n"
 997             "@end table")
 998 #define FUNC_NAME s_scm_sys_symbol_dump
 999 {
1000   SCM e1, e2, e3, e4, e5, e6, e7;
1001   SCM buf;
1002   SCM_VALIDATE_SYMBOL (1, sym);
1003   e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
1004                  sym);
1005   e2 = scm_cons (scm_from_latin1_symbol ("hash"),
1006                  scm_from_ulong (scm_i_symbol_hash (sym)));
1007   e3 = scm_cons (scm_from_latin1_symbol ("interned"),
1008                  scm_symbol_interned_p (sym));
1009   buf = SYMBOL_STRINGBUF (sym);
1010
1011   /* Stringbuf info */
1012   if (!STRINGBUF_WIDE (buf))
1013     {
1014       size_t len = STRINGBUF_LENGTH (buf);
1015       char *cbuf;
1016       SCM sbc = scm_i_make_string (len, &cbuf, 0);
1017       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
1018       e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1019                      sbc);
1020     }
1021   else
1022     {
1023       size_t len = STRINGBUF_LENGTH (buf);
1024       scm_t_wchar *cbuf;
1025       SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
1026       u32_cpy ((scm_t_uint32 *) cbuf,
1027                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
1028       e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1029                      sbc);
1030     }
1031   e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
1032                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
1033   if (STRINGBUF_SHARED (buf))
1034     e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1035                    SCM_BOOL_T);
1036   else
1037     e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1038                    SCM_BOOL_F);
1039   if (STRINGBUF_WIDE (buf))
1040     e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1041                     SCM_BOOL_T);
1042   else
1043     e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1044                     SCM_BOOL_F);
1045   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
1046
1047 }
1048 #undef FUNC_NAME
1049
1050 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1051
1052 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
1053 #define FUNC_NAME s_scm_sys_stringbuf_hist
1054 {
1055   int i;
1056   for (i = 0; i < 1000; i++)
1057     if (lenhist[i])
1058       fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1059   fprintf (stderr, ">999: %u\n", lenhist[1000]);
1060   return SCM_UNSPECIFIED;
1061 }
1062 #undef FUNC_NAME
1063
1064 #endif
1065
1066 \f
1067
1068 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1069             (SCM obj),
1070             "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1071 #define FUNC_NAME s_scm_string_p
1072 {
1073   return scm_from_bool (IS_STRING (obj));
1074 }
1075 #undef FUNC_NAME
1076
1077
1078 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1079
1080 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1081             (SCM chrs),
1082             "@deffnx {Scheme Procedure} list->string chrs\n"
1083             "Return a newly allocated string composed of the arguments,\n"
1084             "@var{chrs}.")
1085 #define FUNC_NAME s_scm_string
1086 {
1087   SCM result = SCM_BOOL_F;
1088   SCM rest;
1089   size_t len;
1090   size_t p = 0;
1091   long i;
1092   int wide = 0;
1093
1094   /* Verify that this is a list of chars.  */
1095   i = scm_ilength (chrs);
1096   SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1097
1098   len = (size_t) i;
1099   rest = chrs;
1100
1101   while (len > 0 && scm_is_pair (rest))
1102     {
1103       SCM elt = SCM_CAR (rest);
1104       SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1105       if (SCM_CHAR (elt) > 0xFF)
1106         wide = 1;
1107       rest = SCM_CDR (rest);
1108       len--;
1109       scm_remember_upto_here_1 (elt);
1110     }
1111
1112   /* Construct a string containing this list of chars.  */
1113   len = (size_t) i;
1114   rest = chrs;
1115
1116   if (wide == 0)
1117     {
1118       char *buf;
1119
1120       result = scm_i_make_string (len, NULL, 0);
1121       result = scm_i_string_start_writing (result);
1122       buf = scm_i_string_writable_chars (result);
1123       while (len > 0 && scm_is_pair (rest))
1124         {
1125           SCM elt = SCM_CAR (rest);
1126           buf[p] = (unsigned char) SCM_CHAR (elt);
1127           p++;
1128           rest = SCM_CDR (rest);
1129           len--;
1130           scm_remember_upto_here_1 (elt);
1131         }
1132     }
1133   else
1134     {
1135       scm_t_wchar *buf;
1136
1137       result = scm_i_make_wide_string (len, NULL, 0);
1138       result = scm_i_string_start_writing (result);
1139       buf = scm_i_string_writable_wide_chars (result);
1140       while (len > 0 && scm_is_pair (rest))
1141         {
1142           SCM elt = SCM_CAR (rest);
1143           buf[p] = SCM_CHAR (elt);
1144           p++;
1145           rest = SCM_CDR (rest);
1146           len--;
1147           scm_remember_upto_here_1 (elt);
1148         }
1149     }
1150   scm_i_string_stop_writing ();
1151
1152   if (len > 0)
1153     scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1154   if (!scm_is_null (rest))
1155     scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1156
1157   return result;
1158 }
1159 #undef FUNC_NAME
1160
1161 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1162             (SCM k, SCM chr),
1163             "Return a newly allocated string of\n"
1164             "length @var{k}.  If @var{chr} is given, then all elements of\n"
1165             "the string are initialized to @var{chr}, otherwise the contents\n"
1166             "of the string are all set to @code{#\nul}.")
1167 #define FUNC_NAME s_scm_make_string
1168 {
1169   return scm_c_make_string (scm_to_size_t (k), chr);
1170 }
1171 #undef FUNC_NAME
1172
1173 SCM
1174 scm_c_make_string (size_t len, SCM chr)
1175 #define FUNC_NAME NULL
1176 {
1177   size_t p;
1178   char *contents = NULL;
1179   SCM res = scm_i_make_string (len, &contents, 0);
1180
1181   /* If no char is given, initialize string contents to NULL.  */
1182   if (SCM_UNBNDP (chr))
1183     memset (contents, 0, len);
1184   else
1185     {
1186       SCM_VALIDATE_CHAR (0, chr);
1187       res = scm_i_string_start_writing (res);
1188       for (p = 0; p < len; p++)
1189         scm_i_string_set_x (res, p, SCM_CHAR (chr));
1190       scm_i_string_stop_writing ();
1191     }
1192
1193   return res;
1194 }
1195 #undef FUNC_NAME
1196
1197 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1198             (SCM string),
1199             "Return the number of characters in @var{string}.")
1200 #define FUNC_NAME s_scm_string_length
1201 {
1202   SCM_VALIDATE_STRING (1, string);
1203   return scm_from_size_t (STRING_LENGTH (string));
1204 }
1205 #undef FUNC_NAME
1206
1207 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1208             (SCM string),
1209             "Return the bytes used to represent a character in @var{string}."
1210             "This will return 1 or 4.")
1211 #define FUNC_NAME s_scm_string_bytes_per_char
1212 {
1213   SCM_VALIDATE_STRING (1, string);
1214   if (!scm_i_is_narrow_string (string))
1215     return scm_from_int (4);
1216
1217   return scm_from_int (1);
1218 }
1219 #undef FUNC_NAME
1220
1221 size_t
1222 scm_c_string_length (SCM string)
1223 {
1224   if (!IS_STRING (string))
1225     scm_wrong_type_arg_msg (NULL, 0, string, "string");
1226   return STRING_LENGTH (string);
1227 }
1228
1229 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1230             (SCM str, SCM k),
1231             "Return character @var{k} of @var{str} using zero-origin\n"
1232             "indexing. @var{k} must be a valid index of @var{str}.")
1233 #define FUNC_NAME s_scm_string_ref
1234 {
1235   size_t len;
1236   unsigned long idx;
1237
1238   SCM_VALIDATE_STRING (1, str);
1239
1240   len = scm_i_string_length (str);
1241   if (SCM_LIKELY (len > 0))
1242     idx = scm_to_unsigned_integer (k, 0, len - 1);
1243   else
1244     scm_out_of_range (NULL, k);
1245
1246   if (scm_i_is_narrow_string (str))
1247     return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1248   else
1249     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1250 }
1251 #undef FUNC_NAME
1252
1253 SCM
1254 scm_c_string_ref (SCM str, size_t p)
1255 {
1256   if (p >= scm_i_string_length (str))
1257     scm_out_of_range (NULL, scm_from_size_t (p));
1258   if (scm_i_is_narrow_string (str))
1259     return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1260   else
1261     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1262
1263 }
1264
1265 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1266             (SCM str, SCM k, SCM chr),
1267             "Store @var{chr} in element @var{k} of @var{str} and return\n"
1268             "an unspecified value. @var{k} must be a valid index of\n"
1269             "@var{str}.")
1270 #define FUNC_NAME s_scm_string_set_x
1271 {
1272   size_t len;
1273   unsigned long idx;
1274
1275   SCM_VALIDATE_STRING (1, str);
1276
1277   len = scm_i_string_length (str);
1278   if (SCM_LIKELY (len > 0))
1279     idx = scm_to_unsigned_integer (k, 0, len - 1);
1280   else
1281     scm_out_of_range (NULL, k);
1282
1283   SCM_VALIDATE_CHAR (3, chr);
1284   str = scm_i_string_start_writing (str);
1285   scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1286   scm_i_string_stop_writing ();
1287
1288   return SCM_UNSPECIFIED;
1289 }
1290 #undef FUNC_NAME
1291
1292 void
1293 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1294 {
1295   if (p >= scm_i_string_length (str))
1296     scm_out_of_range (NULL, scm_from_size_t (p));
1297   str = scm_i_string_start_writing (str);
1298   scm_i_string_set_x (str, p, SCM_CHAR (chr));
1299   scm_i_string_stop_writing ();
1300 }
1301
1302 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1303             (SCM str, SCM start, SCM end),
1304             "Return a newly allocated string formed from the characters\n"
1305             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1306             "ending with index @var{end} (exclusive).\n"
1307             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1308             "exact integers satisfying:\n\n"
1309             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1310 #define FUNC_NAME s_scm_substring
1311 {
1312   size_t len, from, to;
1313
1314   SCM_VALIDATE_STRING (1, str);
1315   len = scm_i_string_length (str);
1316   from = scm_to_unsigned_integer (start, 0, len);
1317   if (SCM_UNBNDP (end))
1318     to = len;
1319   else
1320     to = scm_to_unsigned_integer (end, from, len);
1321   return scm_i_substring (str, from, to);
1322 }
1323 #undef FUNC_NAME
1324
1325 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1326             (SCM str, SCM start, SCM end),
1327             "Return a newly allocated string formed from the characters\n"
1328             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1329             "ending with index @var{end} (exclusive).\n"
1330             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1331             "exact integers satisfying:\n"
1332             "\n"
1333             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1334             "\n"
1335             "The returned string is read-only.\n")
1336 #define FUNC_NAME s_scm_substring_read_only
1337 {
1338   size_t len, from, to;
1339
1340   SCM_VALIDATE_STRING (1, str);
1341   len = scm_i_string_length (str);
1342   from = scm_to_unsigned_integer (start, 0, len);
1343   if (SCM_UNBNDP (end))
1344     to = len;
1345   else
1346     to = scm_to_unsigned_integer (end, from, len);
1347   return scm_i_substring_read_only (str, from, to);
1348 }
1349 #undef FUNC_NAME
1350
1351 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1352             (SCM str, SCM start, SCM end),
1353             "Return a newly allocated string formed from the characters\n"
1354             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1355             "ending with index @var{end} (exclusive).\n"
1356             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1357             "exact integers satisfying:\n\n"
1358             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1359 #define FUNC_NAME s_scm_substring_copy
1360 {
1361   /* For the Scheme version, START is mandatory, but for the C
1362      version, it is optional.  See scm_string_copy in srfi-13.c for a
1363      rationale.
1364   */
1365
1366   size_t from, to;
1367
1368   SCM_VALIDATE_STRING (1, str);
1369   scm_i_get_substring_spec (scm_i_string_length (str),
1370                             start, &from, end, &to);
1371   return scm_i_substring_copy (str, from, to);
1372 }
1373 #undef FUNC_NAME
1374
1375 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1376             (SCM str, SCM start, SCM end),
1377             "Return string that indirectly refers to the characters\n"
1378             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1379             "ending with index @var{end} (exclusive).\n"
1380             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1381             "exact integers satisfying:\n\n"
1382             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1383 #define FUNC_NAME s_scm_substring_shared
1384 {
1385   size_t len, from, to;
1386
1387   SCM_VALIDATE_STRING (1, str);
1388   len = scm_i_string_length (str);
1389   from = scm_to_unsigned_integer (start, 0, len);
1390   if (SCM_UNBNDP (end))
1391     to = len;
1392   else
1393     to = scm_to_unsigned_integer (end, from, len);
1394   return scm_i_substring_shared (str, from, to);
1395 }
1396 #undef FUNC_NAME
1397
1398 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1399             (SCM args),
1400             "Return a newly allocated string whose characters form the\n"
1401             "concatenation of the given strings, @var{args}.")
1402 #define FUNC_NAME s_scm_string_append
1403 {
1404   SCM res;
1405   size_t len = 0;
1406   int wide = 0;
1407   SCM l, s;
1408   size_t i;
1409   union
1410   {
1411     char *narrow;
1412     scm_t_wchar *wide;
1413   } data;
1414
1415   SCM_VALIDATE_REST_ARGUMENT (args);
1416   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1417     {
1418       s = SCM_CAR (l);
1419       SCM_VALIDATE_STRING (SCM_ARGn, s);
1420       len += scm_i_string_length (s);
1421       if (!scm_i_is_narrow_string (s))
1422         wide = 1;
1423     }
1424   data.narrow = NULL;
1425   if (!wide)
1426     res = scm_i_make_string (len, &data.narrow, 0);
1427   else
1428     res = scm_i_make_wide_string (len, &data.wide, 0);
1429
1430   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1431     {
1432       size_t len;
1433       s = SCM_CAR (l);
1434       SCM_VALIDATE_STRING (SCM_ARGn, s);
1435       len = scm_i_string_length (s);
1436       if (!wide)
1437         {
1438           memcpy (data.narrow, scm_i_string_chars (s), len);
1439           data.narrow += len;
1440         }
1441       else
1442         {
1443           if (scm_i_is_narrow_string (s))
1444             {
1445               for (i = 0; i < scm_i_string_length (s); i++)
1446                 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
1447             }
1448           else
1449             u32_cpy ((scm_t_uint32 *) data.wide,
1450                      (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1451           data.wide += len;
1452         }
1453       scm_remember_upto_here_1 (s);
1454     }
1455   return res;
1456 }
1457 #undef FUNC_NAME
1458
1459
1460 \f
1461 /* Charset conversion error handling.  */
1462
1463 SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1464 SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1465
1466 /* Raise an exception informing that character CHR could not be written
1467    to PORT in its current encoding.  */
1468 void
1469 scm_encoding_error (const char *subr, int err, const char *message,
1470                     SCM port, SCM chr)
1471 {
1472   scm_throw (scm_encoding_error_key,
1473              scm_list_n (scm_from_latin1_string (subr),
1474                          scm_from_latin1_string (message),
1475                          scm_from_int (err),
1476                          port, chr,
1477                          SCM_UNDEFINED));
1478 }
1479
1480 /* Raise an exception informing of an encoding error on PORT.  This
1481    means that a character could not be written in PORT's encoding.  */
1482 void
1483 scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1484 {
1485   scm_throw (scm_decoding_error_key,
1486              scm_list_n (scm_from_latin1_string (subr),
1487                          scm_from_latin1_string (message),
1488                          scm_from_int (err),
1489                          port,
1490                          SCM_UNDEFINED));
1491 }
1492
1493 \f
1494 /* String conversion to/from C.  */
1495
1496 static void
1497 decoding_error (const char *func_name, int errno_save,
1498                 const char *str, size_t len)
1499 {
1500   /* Raise an error and pass the raw C string as a bytevector to the `throw'
1501      handler.  */
1502   SCM bv;
1503   signed char *buf;
1504
1505   buf = scm_gc_malloc_pointerless (len, "bytevector");
1506   memcpy (buf, str, len);
1507   bv = scm_c_take_gc_bytevector (buf, len, SCM_BOOL_F);
1508
1509   scm_decoding_error (func_name, errno_save,
1510                       "input locale conversion error", bv);
1511 }
1512
1513 SCM
1514 scm_from_stringn (const char *str, size_t len, const char *encoding,
1515                   scm_t_string_failed_conversion_handler handler)
1516 {
1517   size_t u32len, i;
1518   scm_t_wchar *u32;
1519   int wide = 0;
1520   SCM res;
1521
1522   /* The order of these checks is important. */
1523   if (!str && len != 0)
1524     scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1525   if (len == (size_t) -1)
1526     len = strlen (str);
1527
1528   if (encoding == NULL || len == 0)
1529     return scm_from_latin1_stringn (str, len);
1530   else if (strcmp (encoding, "UTF-8") == 0)
1531     return scm_from_utf8_stringn (str, len);
1532
1533   u32len = 0;
1534   u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1535                                                 (enum iconv_ilseq_handler)
1536                                                 handler,
1537                                                 str, len,
1538                                                 NULL,
1539                                                 NULL, &u32len);
1540
1541   if (SCM_UNLIKELY (u32 == NULL))
1542     decoding_error (__func__, errno, str, len);
1543
1544   i = 0;
1545   while (i < u32len)
1546     if (u32[i++] > 0xFF)
1547       {
1548         wide = 1;
1549         break;
1550       }
1551
1552   if (!wide)
1553     {
1554       char *dst;
1555       res = scm_i_make_string (u32len, &dst, 0);
1556       for (i = 0; i < u32len; i ++)
1557         dst[i] = (unsigned char) u32[i];
1558       dst[u32len] = '\0';
1559     }
1560   else
1561     {
1562       scm_t_wchar *wdst;
1563       res = scm_i_make_wide_string (u32len, &wdst, 0);
1564       u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1565       wdst[u32len] = 0;
1566     }
1567
1568   free (u32);
1569   return res;
1570 }
1571
1572 SCM
1573 scm_from_locale_string (const char *str)
1574 {
1575   return scm_from_locale_stringn (str, -1);
1576 }
1577
1578 SCM
1579 scm_from_locale_stringn (const char *str, size_t len)
1580 {
1581   return scm_from_stringn (str, len, locale_charset (),
1582                            scm_i_get_conversion_strategy (SCM_BOOL_F));
1583 }
1584
1585 SCM
1586 scm_from_latin1_string (const char *str)
1587 {
1588   return scm_from_latin1_stringn (str, -1);
1589 }
1590
1591 SCM
1592 scm_from_latin1_stringn (const char *str, size_t len)
1593 {
1594   char *buf;
1595   SCM result;
1596
1597   if (len == (size_t) -1)
1598     len = strlen (str);
1599
1600   /* Make a narrow string and copy STR as is.  */
1601   result = scm_i_make_string (len, &buf, 0);
1602   memcpy (buf, str, len);
1603
1604   return result;
1605 }
1606
1607 SCM
1608 scm_from_utf8_string (const char *str)
1609 {
1610   return scm_from_utf8_stringn (str, -1);
1611 }
1612
1613 SCM
1614 scm_from_utf8_stringn (const char *str, size_t len)
1615 {
1616   size_t i, char_len;
1617   const scm_t_uint8 *ustr = (const scm_t_uint8 *) str;
1618   int ascii = 1, narrow = 1;
1619   SCM res;
1620
1621   if (len == (size_t) -1)
1622     len = strlen (str);
1623
1624   i = 0;
1625   char_len = 0;
1626
1627   while (i < len)
1628     {
1629       if (ustr[i] <= 127)
1630         {
1631           char_len++;
1632           i++;
1633         }
1634       else
1635         {
1636           ucs4_t c;
1637           int nbytes;
1638
1639           ascii = 0;
1640
1641           nbytes = u8_mbtouc (&c, ustr + i, len - i);
1642
1643           if (nbytes < 0)
1644             /* Bad UTF-8.  */
1645             decoding_error (__func__, errno, str, len);
1646
1647           if (c > 255)
1648             narrow = 0;
1649
1650           char_len++;
1651           i += nbytes;
1652         }
1653     }
1654
1655   if (ascii)
1656     {
1657       char *dst;
1658       res = scm_i_make_string (char_len, &dst, 0);
1659       memcpy (dst, str, len);
1660     }
1661   else if (narrow)
1662     {
1663       char *dst;
1664       size_t j;
1665       ucs4_t c;
1666
1667       res = scm_i_make_string (char_len, &dst, 0);
1668
1669       for (i = 0, j = 0; i < len; j++)
1670         {
1671           i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1672           dst[j] = (signed char) c;
1673         }
1674     }
1675   else
1676     {
1677       scm_t_wchar *dst;
1678       size_t j;
1679       ucs4_t c;
1680
1681       res = scm_i_make_wide_string (char_len, &dst, 0);
1682
1683       for (i = 0, j = 0; i < len; j++)
1684         {
1685           i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1686           dst[j] = c;
1687         }
1688     }
1689
1690   return res;
1691 }
1692
1693 SCM
1694 scm_from_utf32_string (const scm_t_wchar *str)
1695 {
1696   return scm_from_utf32_stringn (str, -1);
1697 }
1698
1699 SCM
1700 scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1701 {
1702   SCM result;
1703   scm_t_wchar *buf;
1704
1705   if (len == (size_t) -1)
1706     len = u32_strlen ((uint32_t *) str);
1707
1708   result = scm_i_make_wide_string (len, &buf, 0);
1709   memcpy (buf, str, len * sizeof (scm_t_wchar));
1710   scm_i_try_narrow_string (result);
1711
1712   return result;
1713 }
1714
1715 /* Create a new scheme string from the C string STR.  The memory of
1716    STR may be used directly as storage for the new string.  */
1717 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1718    would be to register a finalizer to eventually free(3) STR, which isn't
1719    worth it.  Should we just deprecate the `scm_take_' functions?  */
1720 SCM
1721 scm_take_locale_stringn (char *str, size_t len)
1722 {
1723   SCM res;
1724
1725   res = scm_from_locale_stringn (str, len);
1726   free (str);
1727
1728   return res;
1729 }
1730
1731 SCM
1732 scm_take_locale_string (char *str)
1733 {
1734   return scm_take_locale_stringn (str, -1);
1735 }
1736
1737 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1738    *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1739    Set *LENP to the size of the resulting string.
1740
1741    FIXME: This is a hack we should get rid of.  See
1742    <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1743    for details.  */
1744 static void
1745 unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
1746 {
1747   char *before, *after;
1748   size_t i, j;
1749
1750   before = buf;
1751   after = buf;
1752   i = 0;
1753   j = 0;
1754   while (i < *lenp)
1755     {
1756       if ((i <= *lenp - 6)
1757           && before[i] == '\\'
1758           && before[i + 1] == 'u'
1759           && before[i + 2] == '0' && before[i + 3] == '0')
1760         {
1761           /* Convert \u00NN to \xNN */
1762           after[j] = '\\';
1763           after[j + 1] = 'x';
1764           after[j + 2] = tolower ((int) before[i + 4]);
1765           after[j + 3] = tolower ((int) before[i + 5]);
1766           i += 6;
1767           j += 4;
1768         }
1769       else if ((i <= *lenp - 10)
1770                && before[i] == '\\'
1771                && before[i + 1] == 'U'
1772                && before[i + 2] == '0' && before[i + 3] == '0')
1773         {
1774           /* Convert \U00NNNNNN to \UNNNNNN */
1775           after[j] = '\\';
1776           after[j + 1] = 'U';
1777           after[j + 2] = tolower ((int) before[i + 4]);
1778           after[j + 3] = tolower ((int) before[i + 5]);
1779           after[j + 4] = tolower ((int) before[i + 6]);
1780           after[j + 5] = tolower ((int) before[i + 7]);
1781           after[j + 6] = tolower ((int) before[i + 8]);
1782           after[j + 7] = tolower ((int) before[i + 9]);
1783           i += 10;
1784           j += 8;
1785         }
1786       else
1787         {
1788           after[j] = before[i];
1789           i++;
1790           j++;
1791         }
1792     }
1793   *lenp = j;
1794 }
1795
1796 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1797    *LENP-byte locale-encoded string, to `\xXXXX;'.  Set *LEN to the size
1798    of the resulting string.  BUF must be large enough to handle the
1799    worst case when `\uXXXX' escapes (6 characters) are replaced by
1800    `\xXXXX;' (7 characters).  */
1801 static void
1802 unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
1803 {
1804   char *before, *after;
1805   size_t i, j;
1806   /* The worst case is if the input string contains all 4-digit hex escapes.
1807      "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1808   size_t max_out_len = (*lenp * 7) / 6 + 1;
1809   size_t nzeros, ndigits;
1810
1811   before = buf;
1812   after = alloca (max_out_len);
1813   i = 0;
1814   j = 0;
1815   while (i < *lenp)
1816     {
1817       if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1818           || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1819         {
1820           if (before[i + 1] == 'u')
1821             ndigits = 4;
1822           else if (before[i + 1] == 'U')
1823             ndigits = 8;
1824           else
1825             abort ();
1826
1827           /* Add the R6RS hex escape initial sequence.  */
1828           after[j] = '\\';
1829           after[j + 1] = 'x';
1830
1831           /* Move string positions to the start of the hex numbers.  */
1832           i += 2;
1833           j += 2;
1834
1835           /* Find the number of initial zeros in this hex number.  */
1836           nzeros = 0;
1837           while (before[i + nzeros] == '0' && nzeros < ndigits)
1838             nzeros++;
1839
1840           /* Copy the number, skipping initial zeros, and then move the string
1841              positions.  */
1842           if (nzeros == ndigits)
1843             {
1844               after[j] = '0';
1845               i += ndigits;
1846               j += 1;
1847             }
1848           else
1849             {
1850               int pos;
1851               for (pos = 0; pos < ndigits - nzeros; pos++)
1852                 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1853               i += ndigits;
1854               j += (ndigits - nzeros);
1855             }
1856
1857           /* Add terminating semicolon.  */
1858           after[j] = ';';
1859           j++;
1860         }
1861       else
1862         {
1863           after[j] = before[i];
1864           i++;
1865           j++;
1866         }
1867     }
1868   *lenp = j;
1869   memcpy (before, after, j);
1870 }
1871
1872 char *
1873 scm_to_locale_string (SCM str)
1874 {
1875   return scm_to_locale_stringn (str, NULL);
1876 }
1877
1878 char *
1879 scm_to_locale_stringn (SCM str, size_t *lenp)
1880 {
1881   return scm_to_stringn (str, lenp,
1882                          locale_charset (),
1883                          scm_i_get_conversion_strategy (SCM_BOOL_F));
1884 }
1885
1886 char *
1887 scm_to_latin1_string (SCM str)
1888 {
1889   return scm_to_latin1_stringn (str, NULL);
1890 }
1891
1892 char *
1893 scm_to_latin1_stringn (SCM str, size_t *lenp)
1894 #define FUNC_NAME "scm_to_latin1_stringn"
1895 {
1896   char *result;
1897
1898   SCM_VALIDATE_STRING (1, str);
1899
1900   if (scm_i_is_narrow_string (str))
1901     {
1902       size_t len = scm_i_string_length (str);
1903
1904       if (lenp)
1905         *lenp = len;
1906
1907       result = scm_strndup (scm_i_string_data (str), len);
1908     }
1909   else
1910     result = scm_to_stringn (str, lenp, NULL,
1911                              SCM_FAILED_CONVERSION_ERROR);
1912
1913   return result;
1914 }
1915 #undef FUNC_NAME
1916
1917 char *
1918 scm_to_utf8_string (SCM str)
1919 {
1920   return scm_to_utf8_stringn (str, NULL);
1921 }
1922
1923 static size_t
1924 latin1_u8_strlen (const scm_t_uint8 *str, size_t len)
1925 {
1926   size_t ret, i;
1927   for (i = 0, ret = 0; i < len; i++)
1928     ret += (str[i] < 128) ? 1 : 2;
1929   return ret;
1930 }
1931
1932 static scm_t_uint8*
1933 latin1_to_u8 (const scm_t_uint8 *str, size_t latin_len,
1934               scm_t_uint8 *u8_result, size_t *u8_lenp)
1935 {
1936   size_t i, n;
1937   size_t u8_len = latin1_u8_strlen (str, latin_len);
1938
1939   if (!(u8_result && u8_lenp && *u8_lenp > u8_len))
1940     u8_result = scm_malloc (u8_len + 1);
1941   if (u8_lenp)
1942     *u8_lenp = u8_len;
1943
1944   for (i = 0, n = 0; i < latin_len; i++)
1945     n += u8_uctomb (u8_result + n, str[i], u8_len - n);
1946   if (n != u8_len)
1947     abort ();
1948   u8_result[n] = 0;
1949
1950   return u8_result;
1951 }
1952
1953 char *
1954 scm_to_utf8_stringn (SCM str, size_t *lenp)
1955 {
1956   if (scm_i_is_narrow_string (str))
1957     return (char *) latin1_to_u8 ((scm_t_uint8 *) scm_i_string_chars (str),
1958                                   scm_i_string_length (str),
1959                                   NULL, lenp);
1960   else
1961     return (char *) u32_to_u8 ((scm_t_uint32*)scm_i_string_wide_chars (str),
1962                                scm_i_string_length (str),
1963                                NULL, lenp);
1964 }
1965
1966 scm_t_wchar *
1967 scm_to_utf32_string (SCM str)
1968 {
1969   return scm_to_utf32_stringn (str, NULL);
1970 }
1971
1972 scm_t_wchar *
1973 scm_to_utf32_stringn (SCM str, size_t *lenp)
1974 #define FUNC_NAME "scm_to_utf32_stringn"
1975 {
1976   scm_t_wchar *result;
1977
1978   SCM_VALIDATE_STRING (1, str);
1979
1980   if (scm_i_is_narrow_string (str))
1981     {
1982       scm_t_uint8 *codepoints;
1983       size_t i, len;
1984
1985       codepoints = (scm_t_uint8*) scm_i_string_chars (str);
1986       len = scm_i_string_length (str);
1987       if (lenp)
1988         *lenp = len;
1989
1990       result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
1991       for (i = 0; i < len; i++)
1992         result[i] = codepoints[i];
1993       result[len] = 0;
1994     }
1995   else
1996     {
1997       size_t len;
1998
1999       len = scm_i_string_length (str);
2000       if (lenp)
2001         *lenp = len;
2002
2003       result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2004       memcpy (result, scm_i_string_wide_chars (str),
2005               len * sizeof (scm_t_wchar));
2006       result[len] = 0;
2007     }
2008
2009   return result;
2010 }
2011 #undef FUNC_NAME
2012
2013 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
2014    according to ENCODING.  If LENP is non-NULL, set it to the size in bytes of
2015    the returned buffer.  If the conversion to ENCODING fails, apply the strategy
2016    defined by HANDLER.  */
2017 char *
2018 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
2019                 scm_t_string_failed_conversion_handler handler)
2020 {
2021   char *buf;
2022   size_t ilen, len, i;
2023   int ret;
2024   const char *enc;
2025
2026   if (!scm_is_string (str))
2027     scm_wrong_type_arg_msg (NULL, 0, str, "string");
2028   ilen = scm_i_string_length (str);
2029
2030   if (ilen == 0)
2031     {
2032       buf = scm_malloc (1);
2033       buf[0] = '\0';
2034       if (lenp)
2035         *lenp = 0;
2036       return buf;
2037     }
2038
2039   if (lenp == NULL)
2040     for (i = 0; i < ilen; i++)
2041       if (scm_i_string_ref (str, i) == '\0')
2042         scm_misc_error (NULL,
2043                         "string contains #\\nul character: ~S",
2044                         scm_list_1 (str));
2045
2046   if (scm_i_is_narrow_string (str) && (encoding == NULL))
2047     {
2048       /* If using native Latin-1 encoding, just copy the string
2049          contents.  */
2050       if (lenp)
2051         {
2052           buf = scm_malloc (ilen);
2053           memcpy (buf, scm_i_string_chars (str), ilen);
2054           *lenp = ilen;
2055           return buf;
2056         }
2057       else
2058         {
2059           buf = scm_malloc (ilen + 1);
2060           memcpy (buf, scm_i_string_chars (str), ilen);
2061           buf[ilen] = '\0';
2062           return buf;
2063         }
2064     }
2065
2066
2067   buf = NULL;
2068   len = 0;
2069   enc = encoding;
2070   if (enc == NULL)
2071     enc = "ISO-8859-1";
2072   if (scm_i_is_narrow_string (str))
2073     {
2074       ret = mem_iconveh (scm_i_string_chars (str), ilen,
2075                          "ISO-8859-1", enc,
2076                          (enum iconv_ilseq_handler) handler, NULL,
2077                          &buf, &len);
2078
2079       if (ret != 0)
2080         scm_encoding_error (__func__, errno,
2081                             "cannot convert narrow string to output locale",
2082                             SCM_BOOL_F,
2083                             /* FIXME: Faulty character unknown.  */
2084                             SCM_BOOL_F);
2085     }
2086   else
2087     {
2088       buf = u32_conv_to_encoding (enc,
2089                                   (enum iconv_ilseq_handler) handler,
2090                                   (scm_t_uint32 *) scm_i_string_wide_chars (str),
2091                                   ilen,
2092                                   NULL,
2093                                   NULL, &len);
2094       if (buf == NULL)
2095         scm_encoding_error (__func__, errno,
2096                             "cannot convert wide string to output locale",
2097                             SCM_BOOL_F,
2098                             /* FIXME: Faulty character unknown.  */
2099                             SCM_BOOL_F);
2100     }
2101   if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
2102     {
2103       if (SCM_R6RS_ESCAPES_P)
2104         {
2105           /* The worst case is if the input string contains all 4-digit
2106              hex escapes.  "\uXXXX" (six characters) becomes "\xXXXX;"
2107              (seven characters).  Make BUF large enough to hold
2108              that.  */
2109           buf = scm_realloc (buf, (len * 7) / 6 + 1);
2110           unistring_escapes_to_r6rs_escapes (buf, &len);
2111         }
2112       else
2113         unistring_escapes_to_guile_escapes (buf, &len);
2114
2115       buf = scm_realloc (buf, len);
2116     }
2117   if (lenp)
2118     *lenp = len;
2119   else
2120     {
2121       buf = scm_realloc (buf, len + 1);
2122       buf[len] = '\0';
2123     }
2124
2125   scm_remember_upto_here_1 (str);
2126   return buf;
2127 }
2128
2129 size_t
2130 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
2131 {
2132   size_t len;
2133   char *result = NULL;
2134   if (!scm_is_string (str))
2135     scm_wrong_type_arg_msg (NULL, 0, str, "string");
2136   result = scm_to_locale_stringn (str, &len);
2137
2138   memcpy (buf, result, (len > max_len) ? max_len : len);
2139   free (result);
2140
2141   scm_remember_upto_here_1 (str);
2142   return len;
2143 }
2144
2145 \f
2146 /* Unicode string normalization.  */
2147
2148 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2149    libguile/i18n.c.  It would be useful to have this factored out into a more
2150    convenient location, but its use of alloca makes that tricky to do. */
2151
2152 static SCM
2153 normalize_str (SCM string, uninorm_t form)
2154 {
2155   SCM ret;
2156   scm_t_uint32 *w_str;
2157   scm_t_wchar *cbuf;
2158   size_t rlen, len = scm_i_string_length (string);
2159
2160   if (scm_i_is_narrow_string (string))
2161     {
2162       size_t i;
2163       const char *buf = scm_i_string_chars (string);
2164
2165       w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2166
2167       for (i = 0; i < len; i ++)
2168         w_str[i] = (unsigned char) buf[i];
2169       w_str[len] = 0;
2170     }
2171   else
2172     w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2173
2174   w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2175
2176   ret = scm_i_make_wide_string (rlen, &cbuf, 0);
2177   u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2178   free (w_str);
2179
2180   scm_i_try_narrow_string (ret);
2181
2182   return ret;
2183 }
2184
2185 SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2186             (SCM string),
2187             "Returns the NFC normalized form of @var{string}.")
2188 #define FUNC_NAME s_scm_string_normalize_nfc
2189 {
2190   SCM_VALIDATE_STRING (1, string);
2191   return normalize_str (string, UNINORM_NFC);
2192 }
2193 #undef FUNC_NAME
2194
2195 SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2196             (SCM string),
2197             "Returns the NFD normalized form of @var{string}.")
2198 #define FUNC_NAME s_scm_string_normalize_nfd
2199 {
2200   SCM_VALIDATE_STRING (1, string);
2201   return normalize_str (string, UNINORM_NFD);
2202 }
2203 #undef FUNC_NAME
2204
2205 SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2206             (SCM string),
2207             "Returns the NFKC normalized form of @var{string}.")
2208 #define FUNC_NAME s_scm_string_normalize_nfkc
2209 {
2210   SCM_VALIDATE_STRING (1, string);
2211   return normalize_str (string, UNINORM_NFKC);
2212 }
2213 #undef FUNC_NAME
2214
2215 SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2216             (SCM string),
2217             "Returns the NFKD normalized form of @var{string}.")
2218 #define FUNC_NAME s_scm_string_normalize_nfkd
2219 {
2220   SCM_VALIDATE_STRING (1, string);
2221   return normalize_str (string, UNINORM_NFKD);
2222 }
2223 #undef FUNC_NAME
2224
2225 /* converts C scm_array of strings to SCM scm_list of strings.
2226    If argc < 0, a null terminated scm_array is assumed.
2227    The current locale encoding is assumed */
2228 SCM
2229 scm_makfromstrs (int argc, char **argv)
2230 {
2231   int i = argc;
2232   SCM lst = SCM_EOL;
2233   if (0 > i)
2234     for (i = 0; argv[i]; i++);
2235   while (i--)
2236     lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2237   return lst;
2238 }
2239
2240 /* Return a newly allocated array of char pointers to each of the strings
2241    in args, with a terminating NULL pointer.  The strings are encoded using
2242    the current locale. */
2243
2244 char **
2245 scm_i_allocate_string_pointers (SCM list)
2246 #define FUNC_NAME "scm_i_allocate_string_pointers"
2247 {
2248   char **result;
2249   int list_len = scm_ilength (list);
2250   int i;
2251
2252   if (list_len < 0)
2253     scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2254
2255   result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2256                           "string pointers");
2257   result[list_len] = NULL;
2258
2259   /* The list might have been modified in another thread, so
2260      we check LIST before each access.
2261    */
2262   for (i = 0; i < list_len && scm_is_pair (list); i++)
2263     {
2264       SCM str = SCM_CAR (list);
2265       size_t len;  /* String length in bytes */
2266       char *c_str = scm_to_locale_stringn (str, &len);
2267
2268       /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2269          scm_malloc to allocate the returned string, which must be
2270          explicitly deallocated.  This forces us to copy the string a
2271          second time into a new buffer.  Ideally there would be variants
2272          of scm_to_*_stringn that can return garbage-collected buffers. */
2273
2274       result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2275       memcpy (result[i], c_str, len);
2276       result[i][len] = '\0';
2277       free (c_str);
2278
2279       list = SCM_CDR (list);
2280     }
2281
2282   return result;
2283 }
2284 #undef FUNC_NAME
2285
2286 void
2287 scm_i_get_substring_spec (size_t len,
2288                           SCM start, size_t *cstart,
2289                           SCM end, size_t *cend)
2290 {
2291   if (SCM_UNBNDP (start))
2292     *cstart = 0;
2293   else
2294     *cstart = scm_to_unsigned_integer (start, 0, len);
2295
2296   if (SCM_UNBNDP (end))
2297     *cend = len;
2298   else
2299     *cend = scm_to_unsigned_integer (end, *cstart, len);
2300 }
2301
2302 static SCM
2303 string_handle_ref (scm_t_array_handle *h, size_t index)
2304 {
2305   return scm_c_string_ref (h->array, index);
2306 }
2307
2308 static void
2309 string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2310 {
2311   scm_c_string_set_x (h->array, index, val);
2312 }
2313
2314 static void
2315 string_get_handle (SCM v, scm_t_array_handle *h)
2316 {
2317   h->array = v;
2318   h->ndims = 1;
2319   h->dims = &h->dim0;
2320   h->dim0.lbnd = 0;
2321   h->dim0.ubnd = scm_c_string_length (v) - 1;
2322   h->dim0.inc = 1;
2323   h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2324   h->elements = h->writable_elements = NULL;
2325 }
2326
2327 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2328                           string_handle_ref, string_handle_set,
2329                           string_get_handle)
2330 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2331
2332 void
2333 scm_init_strings ()
2334 {
2335   scm_nullstr = scm_i_make_string (0, NULL, 0);
2336
2337 #include "libguile/strings.x"
2338 }
2339
2340
2341 /*
2342   Local Variables:
2343   c-file-style: "gnu"
2344   End:
2345 */