libguile/strings.c

   1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   2  *
   3  * This library is free software; you can redistribute it and/or
   4  * modify it under the terms of the GNU Lesser General Public License
   5  * as published by the Free Software Foundation; either version 3 of
   6  * the License, or (at your option) any later version.
   7  *
   8  * This library is distributed in the hope that it will be useful, but
   9  * WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * Lesser General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU Lesser General Public
  14  * License along with this library; if not, write to the Free Software
  15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  16  * 02110-1301 USA
  17  */
  18
  19
  20 \f
  21 #ifdef HAVE_CONFIG_H
  22 # include <config.h>
  23 #endif
  24
  25 #include <alloca.h>
  26 #include <string.h>
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <uninorm.h>
  30 #include <unistr.h>
  31 #include <uniconv.h>
  32
  33 #include "striconveh.h"
  34
  35 #include "libguile/_scm.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/root.h"
  38 #include "libguile/strings.h"
  39 #include "libguile/error.h"
  40 #include "libguile/generalized-vectors.h"
  41 #include "libguile/deprecation.h"
  42 #include "libguile/validate.h"
  43 #include "libguile/private-options.h"
  44
  45 \f
  46
  47 /* {Strings}
  48  */
  49
  50
  51 /* Stringbufs
  52  *
  53  * XXX - keeping an accurate refcount during GC seems to be quite
  54  * tricky, so we just keep score of whether a stringbuf might be
  55  * shared, not whether it definitely is.
  56  *
  57  * The scheme I (mvo) tried to keep an accurate reference count would
  58  * recount all strings that point to a stringbuf during the mark-phase
  59  * of the GC.  This was done since one cannot access the stringbuf of
  60  * a string when that string is freed (in order to decrease the
  61  * reference count).  The memory of the stringbuf might have been
  62  * reused already for something completely different.
  63  *
  64  * This recounted worked for a small number of threads beating on
  65  * cow-strings, but it failed randomly with more than 10 threads, say.
  66  * I couldn't figure out what went wrong, so I used the conservative
  67  * approach implemented below.
  68  *
  69  * There are 2 storage strategies for stringbufs: 8-bit and wide.  8-bit
  70  * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
  71  * strings.
  72  */
  73
  74 /* The size in words of the stringbuf header (type tag + size).  */
  75 #define STRINGBUF_HEADER_SIZE   2U
  76
  77 #define STRINGBUF_HEADER_BYTES  (STRINGBUF_HEADER_SIZE * sizeof (SCM))
  78
  79 #define STRINGBUF_F_SHARED      SCM_I_STRINGBUF_F_SHARED
  80 #define STRINGBUF_F_WIDE        SCM_I_STRINGBUF_F_WIDE
  81
  82 #define STRINGBUF_TAG           scm_tc7_stringbuf
  83 #define STRINGBUF_SHARED(buf)   (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
  84 #define STRINGBUF_WIDE(buf)     (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
  85
  86 #define STRINGBUF_CONTENTS(buf) ((void *)                               \
  87                                  SCM_CELL_OBJECT_LOC (buf,              \
  88                                                       STRINGBUF_HEADER_SIZE))
  89 #define STRINGBUF_CHARS(buf)    ((unsigned char *) STRINGBUF_CONTENTS (buf))
  90 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
  91
  92 #define STRINGBUF_LENGTH(buf)   (SCM_CELL_WORD_1 (buf))
  93
  94 #define SET_STRINGBUF_SHARED(buf)                                       \
  95   do                                                                    \
  96     {                                                                   \
  97       /* Don't modify BUF if it's already marked as shared since it might be \
  98          a read-only, statically allocated stringbuf.  */               \
  99       if (SCM_LIKELY (!STRINGBUF_SHARED (buf)))                         \
 100         SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
 101     }                                                                   \
 102   while (0)
 103
 104 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 105 static size_t lenhist[1001];
 106 #endif
 107
 108 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
 109    characters. */
 110 static SCM
 111 make_stringbuf (size_t len)
 112 {
 113   /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
 114      scm_i_symbol_chars, all stringbufs are null-terminated.  Once
 115      SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
 116      has been changed for scm_i_symbol_chars, this null-termination
 117      can be dropped.
 118   */
 119
 120   SCM buf;
 121
 122 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 123   if (len < 1000)
 124     lenhist[len]++;
 125   else
 126     lenhist[1000]++;
 127 #endif
 128
 129   buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
 130                                             "string"));
 131
 132   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
 133   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 134
 135   STRINGBUF_CHARS (buf)[len] = 0;
 136
 137   return buf;
 138 }
 139
 140 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
 141    characters.  */
 142 static SCM
 143 make_wide_stringbuf (size_t len)
 144 {
 145   SCM buf;
 146   size_t raw_len;
 147
 148 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 149   if (len < 1000)
 150     lenhist[len]++;
 151   else
 152     lenhist[1000]++;
 153 #endif
 154
 155   raw_len = (len + 1) * sizeof (scm_t_wchar);
 156   buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
 157                                             "string"));
 158
 159   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
 160   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 161
 162   STRINGBUF_WIDE_CHARS (buf)[len] = 0;
 163
 164   return buf;
 165 }
 166
 167 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
 168    characters from BUF.  */
 169 static SCM
 170 wide_stringbuf (SCM buf)
 171 {
 172   SCM new_buf;
 173
 174   if (STRINGBUF_WIDE (buf))
 175     new_buf = buf;
 176   else
 177     {
 178       size_t i, len;
 179       scm_t_wchar *mem;
 180
 181       len = STRINGBUF_LENGTH (buf);
 182
 183       new_buf = make_wide_stringbuf (len);
 184
 185       mem = STRINGBUF_WIDE_CHARS (new_buf);
 186       for (i = 0; i < len; i++)
 187         mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
 188       mem[len] = 0;
 189     }
 190
 191   return new_buf;
 192 }
 193
 194 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
 195    characters from BUF, if possible.  */
 196 static SCM
 197 narrow_stringbuf (SCM buf)
 198 {
 199   SCM new_buf;
 200
 201   if (!STRINGBUF_WIDE (buf))
 202     new_buf = buf;
 203   else
 204     {
 205       size_t i, len;
 206       scm_t_wchar *wmem;
 207       unsigned char *mem;
 208
 209       len = STRINGBUF_LENGTH (buf);
 210       wmem = STRINGBUF_WIDE_CHARS (buf);
 211
 212       for (i = 0; i < len; i++)
 213         if (wmem[i] > 0xFF)
 214           /* BUF cannot be narrowed.  */
 215           return buf;
 216
 217       new_buf = make_stringbuf (len);
 218
 219       mem = STRINGBUF_CHARS (new_buf);
 220       for (i = 0; i < len; i++)
 221         mem[i] = (unsigned char) wmem[i];
 222       mem[len] = 0;
 223     }
 224
 225   return new_buf;
 226 }
 227
 228 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
 229
 230 \f
 231 /* Copy-on-write strings.
 232  */
 233
 234 #define STRING_TAG            scm_tc7_string
 235
 236 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
 237 #define STRING_START(str)     ((size_t)SCM_CELL_WORD_2(str))
 238 #define STRING_LENGTH(str)    ((size_t)SCM_CELL_WORD_3(str))
 239
 240 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
 241 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
 242
 243 #define IS_STRING(str)        (SCM_HAS_TYP7 (str, STRING_TAG))
 244
 245 /* Read-only strings.
 246  */
 247
 248 #define RO_STRING_TAG         scm_tc7_ro_string
 249 #define IS_RO_STRING(str)     (SCM_CELL_TYPE(str)==RO_STRING_TAG)
 250
 251 /* Mutation-sharing substrings
 252  */
 253
 254 #define SH_STRING_TAG       (scm_tc7_string + 0x100)
 255
 256 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
 257 /* START and LENGTH as for STRINGs. */
 258
 259 #define IS_SH_STRING(str)   (SCM_CELL_TYPE(str)==SH_STRING_TAG)
 260
 261 SCM scm_nullstr;
 262
 263 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
 264    characters.  CHARSP, if not NULL, will be set to location of the
 265    char array.  If READ_ONLY_P, the returned string is read-only;
 266    otherwise it is writable.  */
 267 SCM
 268 scm_i_make_string (size_t len, char **charsp, int read_only_p)
 269 {
 270   static SCM null_stringbuf = SCM_BOOL_F;
 271   SCM buf;
 272   SCM res;
 273
 274   if (len == 0)
 275     {
 276       if (SCM_UNLIKELY (scm_is_false (null_stringbuf)))
 277         {
 278           null_stringbuf = make_stringbuf (0);
 279           SET_STRINGBUF_SHARED (null_stringbuf);
 280         }
 281       buf = null_stringbuf;
 282     }
 283   else
 284     buf = make_stringbuf (len);
 285
 286   if (charsp)
 287     *charsp = (char *) STRINGBUF_CHARS (buf);
 288   res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
 289                          SCM_UNPACK (buf),
 290                          (scm_t_bits) 0, (scm_t_bits) len);
 291   return res;
 292 }
 293
 294 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
 295    characters.  CHARSP, if not NULL, will be set to location of the
 296    character array.  If READ_ONLY_P, the returned string is read-only;
 297    otherwise it is writable.  */
 298 SCM
 299 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
 300 {
 301   SCM buf = make_wide_stringbuf (len);
 302   SCM res;
 303   if (charsp)
 304     *charsp = STRINGBUF_WIDE_CHARS (buf);
 305   res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
 306                          SCM_UNPACK (buf),
 307                          (scm_t_bits) 0, (scm_t_bits) len);
 308   return res;
 309 }
 310
 311 static void
 312 validate_substring_args (SCM str, size_t start, size_t end)
 313 {
 314   if (!IS_STRING (str))
 315     scm_wrong_type_arg_msg (NULL, 0, str, "string");
 316   if (start > STRING_LENGTH (str))
 317     scm_out_of_range (NULL, scm_from_size_t (start));
 318   if (end > STRING_LENGTH (str) || end < start)
 319     scm_out_of_range (NULL, scm_from_size_t (end));
 320 }
 321
 322 static inline void
 323 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
 324 {
 325   *start = STRING_START (*str);
 326   if (IS_SH_STRING (*str))
 327     {
 328       *str = SH_STRING_STRING (*str);
 329       *start += STRING_START (*str);
 330     }
 331   *buf = STRING_STRINGBUF (*str);
 332 }
 333
 334 SCM
 335 scm_i_substring (SCM str, size_t start, size_t end)
 336 {
 337   if (start == end)
 338     return scm_i_make_string (0, NULL, 0);
 339   else
 340     {
 341       SCM buf;
 342       size_t str_start;
 343       get_str_buf_start (&str, &buf, &str_start);
 344       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 345       SET_STRINGBUF_SHARED (buf);
 346       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 347       return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
 348                               (scm_t_bits)str_start + start,
 349                               (scm_t_bits) end - start);
 350     }
 351 }
 352
 353 SCM
 354 scm_i_substring_read_only (SCM str, size_t start, size_t end)
 355 {
 356   if (start == end)
 357     return scm_i_make_string (0, NULL, 1);
 358   else
 359     {
 360       SCM buf;
 361       size_t str_start;
 362       get_str_buf_start (&str, &buf, &str_start);
 363       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 364       SET_STRINGBUF_SHARED (buf);
 365       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 366       return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
 367                               (scm_t_bits)str_start + start,
 368                               (scm_t_bits) end - start);
 369     }
 370 }
 371
 372 SCM
 373 scm_i_substring_copy (SCM str, size_t start, size_t end)
 374 {
 375   size_t len = end - start;
 376   SCM buf, my_buf, substr;
 377   size_t str_start;
 378   int wide = 0;
 379   get_str_buf_start (&str, &buf, &str_start);
 380   if (scm_i_is_narrow_string (str))
 381     {
 382       my_buf = make_stringbuf (len);
 383       memcpy (STRINGBUF_CHARS (my_buf),
 384               STRINGBUF_CHARS (buf) + str_start + start, len);
 385     }
 386   else
 387     {
 388       my_buf = make_wide_stringbuf (len);
 389       u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
 390                (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
 391                                  + start), len);
 392       wide = 1;
 393     }
 394   scm_remember_upto_here_1 (buf);
 395   substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
 396                             (scm_t_bits) 0, (scm_t_bits) len);
 397   if (wide)
 398     scm_i_try_narrow_string (substr);
 399   return substr;
 400 }
 401
 402 SCM
 403 scm_i_substring_shared (SCM str, size_t start, size_t end)
 404 {
 405   if (start == 0 && end == STRING_LENGTH (str))
 406     return str;
 407   else if (start == end)
 408     return scm_i_make_string (0, NULL, 0);
 409   else
 410     {
 411       size_t len = end - start;
 412       if (IS_SH_STRING (str))
 413         {
 414           start += STRING_START (str);
 415           str = SH_STRING_STRING (str);
 416         }
 417       return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
 418                               (scm_t_bits)start, (scm_t_bits) len);
 419     }
 420 }
 421
 422 SCM
 423 scm_c_substring (SCM str, size_t start, size_t end)
 424 {
 425   validate_substring_args (str, start, end);
 426   return scm_i_substring (str, start, end);
 427 }
 428
 429 SCM
 430 scm_c_substring_read_only (SCM str, size_t start, size_t end)
 431 {
 432   validate_substring_args (str, start, end);
 433   return scm_i_substring_read_only (str, start, end);
 434 }
 435
 436 SCM
 437 scm_c_substring_copy (SCM str, size_t start, size_t end)
 438 {
 439   validate_substring_args (str, start, end);
 440   return scm_i_substring_copy (str, start, end);
 441 }
 442
 443 SCM
 444 scm_c_substring_shared (SCM str, size_t start, size_t end)
 445 {
 446   validate_substring_args (str, start, end);
 447   return scm_i_substring_shared (str, start, end);
 448 }
 449
 450 \f
 451 /* Internal accessors
 452  */
 453
 454 /* Returns the number of characters in STR.  This may be different
 455    than the memory size of the string storage.  */
 456 size_t
 457 scm_i_string_length (SCM str)
 458 {
 459   return STRING_LENGTH (str);
 460 }
 461
 462 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
 463    encoding.  False if it is 'wide', having a 32-bit UCS-4
 464    encoding.  */
 465 int
 466 scm_i_is_narrow_string (SCM str)
 467 {
 468   if (IS_SH_STRING (str))
 469     str = SH_STRING_STRING (str);
 470
 471   return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
 472 }
 473
 474 /* Try to coerce a string to be narrow.  It if is narrow already, do
 475    nothing.  If it is wide, shrink it to narrow if none of its
 476    characters are above 0xFF.  Return true if the string is narrow or
 477    was made to be narrow.  */
 478 int
 479 scm_i_try_narrow_string (SCM str)
 480 {
 481   if (IS_SH_STRING (str))
 482     str = SH_STRING_STRING (str);
 483
 484   SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
 485
 486   return scm_i_is_narrow_string (str);
 487 }
 488
 489 /* Return a pointer to the raw data of the string, which can be either Latin-1
 490    or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'.  */
 491 const void *
 492 scm_i_string_data (SCM str)
 493 {
 494   SCM buf;
 495   size_t start;
 496   const char *data;
 497
 498   get_str_buf_start (&str, &buf, &start);
 499
 500   data = STRINGBUF_CONTENTS (buf);
 501   data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
 502
 503   return data;
 504 }
 505
 506 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
 507    STR.  */
 508 const char *
 509 scm_i_string_chars (SCM str)
 510 {
 511   SCM buf;
 512   size_t start;
 513   get_str_buf_start (&str, &buf, &start);
 514   if (scm_i_is_narrow_string (str))
 515     return (const char *) STRINGBUF_CHARS (buf) + start;
 516   else
 517     scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
 518                     scm_list_1 (str));
 519   return NULL;
 520 }
 521
 522 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
 523    STR.  */
 524 const scm_t_wchar *
 525 scm_i_string_wide_chars (SCM str)
 526 {
 527   SCM buf;
 528   size_t start;
 529
 530   get_str_buf_start (&str, &buf, &start);
 531   if (!scm_i_is_narrow_string (str))
 532     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
 533   else
 534     scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
 535                     scm_list_1 (str));
 536 }
 537
 538 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
 539    a new string buffer, so that it can be modified without modifying
 540    other strings.  Also, lock the string mutex.  Later, one must call
 541    scm_i_string_stop_writing to unlock the mutex.  */
 542 SCM
 543 scm_i_string_start_writing (SCM orig_str)
 544 {
 545   SCM buf, str = orig_str;
 546   size_t start;
 547
 548   get_str_buf_start (&str, &buf, &start);
 549   if (IS_RO_STRING (str))
 550     scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
 551
 552   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 553   if (STRINGBUF_SHARED (buf))
 554     {
 555       /* Clone the stringbuf.  */
 556       size_t len = STRING_LENGTH (str);
 557       SCM new_buf;
 558
 559       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 560
 561       if (scm_i_is_narrow_string (str))
 562         {
 563           new_buf = make_stringbuf (len);
 564           memcpy (STRINGBUF_CHARS (new_buf),
 565                   STRINGBUF_CHARS (buf) + STRING_START (str), len);
 566
 567         }
 568       else
 569         {
 570           new_buf = make_wide_stringbuf (len);
 571           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 572                    (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
 573                                      + STRING_START (str)), len);
 574         }
 575
 576       SET_STRING_STRINGBUF (str, new_buf);
 577       start -= STRING_START (str);
 578
 579       /* FIXME: The following operations are not atomic, so other threads
 580          looking at STR may see an inconsistent state.  Nevertheless it can't
 581          hurt much since (i) accessing STR while it is being mutated can't
 582          yield a crash, and (ii) concurrent accesses to STR should be
 583          protected by a mutex at the application level.  The latter may not
 584          apply when STR != ORIG_STR, though.  */
 585       SET_STRING_START (str, 0);
 586       SET_STRING_STRINGBUF (str, new_buf);
 587
 588       buf = new_buf;
 589
 590       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 591     }
 592   return orig_str;
 593 }
 594
 595 /* Return a pointer to the 8-bit Latin-1 chars of a string.  */
 596 char *
 597 scm_i_string_writable_chars (SCM str)
 598 {
 599   SCM buf;
 600   size_t start;
 601
 602   get_str_buf_start (&str, &buf, &start);
 603   if (scm_i_is_narrow_string (str))
 604     return (char *) STRINGBUF_CHARS (buf) + start;
 605   else
 606     scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
 607                     scm_list_1 (str));
 608   return NULL;
 609 }
 610
 611 /* Return a pointer to the UCS-4 codepoints of a string.  */
 612 static scm_t_wchar *
 613 scm_i_string_writable_wide_chars (SCM str)
 614 {
 615   SCM buf;
 616   size_t start;
 617
 618   get_str_buf_start (&str, &buf, &start);
 619   if (!scm_i_is_narrow_string (str))
 620     return STRINGBUF_WIDE_CHARS (buf) + start;
 621   else
 622     scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
 623                     scm_list_1 (str));
 624 }
 625
 626 /* Unlock the string mutex that was locked when
 627    scm_i_string_start_writing was called.  */
 628 void
 629 scm_i_string_stop_writing (void)
 630 {
 631   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 632 }
 633
 634 /* Return the Xth character of STR as a UCS-4 codepoint.  */
 635 scm_t_wchar
 636 scm_i_string_ref (SCM str, size_t x)
 637 {
 638   if (scm_i_is_narrow_string (str))
 639     return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
 640   else
 641     return scm_i_string_wide_chars (str)[x];
 642 }
 643
 644 /* Returns index+1 of the first char in STR that matches C, or
 645    0 if the char is not found.  */
 646 int
 647 scm_i_string_contains_char (SCM str, char ch)
 648 {
 649   size_t i;
 650   size_t len = scm_i_string_length (str);
 651
 652   i = 0;
 653   if (scm_i_is_narrow_string (str))
 654     {
 655       while (i < len)
 656         {
 657           if (scm_i_string_chars (str)[i] == ch)
 658             return i+1;
 659           i++;
 660         }
 661     }
 662   else
 663     {
 664       while (i < len)
 665         {
 666           if (scm_i_string_wide_chars (str)[i]
 667               == (unsigned char) ch)
 668             return i+1;
 669           i++;
 670         }
 671     }
 672   return 0;
 673 }
 674
 675 int
 676 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
 677 {
 678   if (scm_i_is_narrow_string (sstr))
 679     {
 680       const char *a = scm_i_string_chars (sstr) + start_x;
 681       const char *b = cstr;
 682       return strncmp (a, b, strlen(b));
 683     }
 684   else
 685     {
 686       size_t i;
 687       const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
 688       const char *b = cstr;
 689       for (i = 0; i < strlen (b); i++)
 690         {
 691           if (a[i] != (unsigned char) b[i])
 692             return 1;
 693         }
 694     }
 695   return 0;
 696 }
 697
 698 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
 699 void
 700 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
 701 {
 702   if (IS_SH_STRING (str))
 703     {
 704       p += STRING_START (str);
 705       str = SH_STRING_STRING (str);
 706     }
 707
 708   if (chr > 0xFF && scm_i_is_narrow_string (str))
 709     SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
 710
 711   if (scm_i_is_narrow_string (str))
 712     {
 713       char *dst = scm_i_string_writable_chars (str);
 714       dst[p] = chr;
 715     }
 716   else
 717     {
 718       scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
 719       dst[p] = chr;
 720     }
 721 }
 722
 723 \f
 724 /* Symbols.
 725
 726    Basic symbol creation and accessing is done here, the rest is in
 727    symbols.[hc].  This has been done to keep stringbufs and the
 728    internals of strings and string-like objects confined to this file.
 729 */
 730
 731 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
 732
 733 SCM
 734 scm_i_make_symbol (SCM name, scm_t_bits flags,
 735                    unsigned long hash, SCM props)
 736 {
 737   SCM buf;
 738   size_t start = STRING_START (name);
 739   size_t length = STRING_LENGTH (name);
 740
 741   if (IS_SH_STRING (name))
 742     {
 743       name = SH_STRING_STRING (name);
 744       start += STRING_START (name);
 745     }
 746   buf = SYMBOL_STRINGBUF (name);
 747
 748   if (start == 0 && length == STRINGBUF_LENGTH (buf))
 749     {
 750       /* reuse buf. */
 751       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 752       SET_STRINGBUF_SHARED (buf);
 753       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 754     }
 755   else
 756     {
 757       /* make new buf. */
 758       if (scm_i_is_narrow_string (name))
 759         {
 760           SCM new_buf = make_stringbuf (length);
 761           memcpy (STRINGBUF_CHARS (new_buf),
 762                   STRINGBUF_CHARS (buf) + start, length);
 763           buf = new_buf;
 764         }
 765       else
 766         {
 767           SCM new_buf = make_wide_stringbuf (length);
 768           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 769                    (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
 770                    length);
 771           buf = new_buf;
 772         }
 773     }
 774   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 775                           (scm_t_bits) hash, SCM_UNPACK (props));
 776 }
 777
 778 SCM
 779 scm_i_c_make_symbol (const char *name, size_t len,
 780                      scm_t_bits flags, unsigned long hash, SCM props)
 781 {
 782   SCM buf = make_stringbuf (len);
 783   memcpy (STRINGBUF_CHARS (buf), name, len);
 784
 785   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 786                           (scm_t_bits) hash, SCM_UNPACK (props));
 787 }
 788
 789 /* Returns the number of characters in SYM.  This may be different
 790    from the memory size of SYM.  */
 791 size_t
 792 scm_i_symbol_length (SCM sym)
 793 {
 794   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 795 }
 796
 797 size_t
 798 scm_c_symbol_length (SCM sym)
 799 #define FUNC_NAME "scm_c_symbol_length"
 800 {
 801   SCM_VALIDATE_SYMBOL (1, sym);
 802
 803   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 804 }
 805 #undef FUNC_NAME
 806
 807 /* True if the name of SYM is stored as a Latin-1 encoded string.
 808    False if it is stored as a 32-bit UCS-4-encoded string.  */
 809 int
 810 scm_i_is_narrow_symbol (SCM sym)
 811 {
 812   SCM buf;
 813
 814   buf = SYMBOL_STRINGBUF (sym);
 815   return !STRINGBUF_WIDE (buf);
 816 }
 817
 818 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
 819    contains the name of SYM.  */
 820 const char *
 821 scm_i_symbol_chars (SCM sym)
 822 {
 823   SCM buf;
 824
 825   buf = SYMBOL_STRINGBUF (sym);
 826   if (!STRINGBUF_WIDE (buf))
 827     return (const char *) STRINGBUF_CHARS (buf);
 828   else
 829     scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
 830                     scm_list_1 (sym));
 831 }
 832
 833 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
 834    symbol's name.  */
 835 const scm_t_wchar *
 836 scm_i_symbol_wide_chars (SCM sym)
 837 {
 838   SCM buf;
 839
 840   buf = SYMBOL_STRINGBUF (sym);
 841   if (STRINGBUF_WIDE (buf))
 842     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
 843   else
 844     scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
 845                     scm_list_1 (sym));
 846 }
 847
 848 SCM
 849 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
 850 {
 851   SCM buf = SYMBOL_STRINGBUF (sym);
 852   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 853   SET_STRINGBUF_SHARED (buf);
 854   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 855   return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
 856                           (scm_t_bits)start, (scm_t_bits) end - start);
 857 }
 858
 859 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint.  */
 860 scm_t_wchar
 861 scm_i_symbol_ref (SCM sym, size_t x)
 862 {
 863   if (scm_i_is_narrow_symbol (sym))
 864     return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
 865   else
 866     return scm_i_symbol_wide_chars (sym)[x];
 867 }
 868
 869 /* Debugging
 870  */
 871
 872 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
 873             "Returns an association list containing debugging information\n"
 874             "for @var{str}. The association list has the following entries."
 875             "@table @code\n"
 876             "@item string\n"
 877             "The string itself.\n"
 878             "@item start\n"
 879             "The start index of the string into its stringbuf\n"
 880             "@item length\n"
 881             "The length of the string\n"
 882             "@item shared\n"
 883             "If this string is a substring, it returns its parent string.\n"
 884             "Otherwise, it returns @code{#f}\n"
 885             "@item read-only\n"
 886             "@code{#t} if the string is read-only\n"
 887             "@item stringbuf-chars\n"
 888             "A new string containing this string's stringbuf's characters\n"
 889             "@item stringbuf-length\n"
 890             "The number of characters in this stringbuf\n"
 891             "@item stringbuf-shared\n"
 892             "@code{#t} if this stringbuf is shared\n"
 893             "@item stringbuf-wide\n"
 894             "@code{#t} if this stringbuf's characters are stored in a\n"
 895             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
 896             "buffer\n"
 897             "@end table")
 898 #define FUNC_NAME s_scm_sys_string_dump
 899 {
 900   SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
 901   SCM buf;
 902   SCM_VALIDATE_STRING (1, str);
 903
 904   /* String info */
 905   e1 = scm_cons (scm_from_latin1_symbol ("string"),
 906                  str);
 907   e2 = scm_cons (scm_from_latin1_symbol ("start"),
 908                  scm_from_size_t (STRING_START (str)));
 909   e3 = scm_cons (scm_from_latin1_symbol ("length"),
 910                  scm_from_size_t (STRING_LENGTH (str)));
 911
 912   if (IS_SH_STRING (str))
 913     {
 914       e4 = scm_cons (scm_from_latin1_symbol ("shared"),
 915                      SH_STRING_STRING (str));
 916       buf = STRING_STRINGBUF (SH_STRING_STRING (str));
 917     }
 918   else
 919     {
 920       e4 = scm_cons (scm_from_latin1_symbol ("shared"),
 921                      SCM_BOOL_F);
 922       buf = STRING_STRINGBUF (str);
 923     }
 924
 925   if (IS_RO_STRING (str))
 926     e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
 927                    SCM_BOOL_T);
 928   else
 929     e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
 930                    SCM_BOOL_F);
 931
 932   /* Stringbuf info */
 933   if (!STRINGBUF_WIDE (buf))
 934     {
 935       size_t len = STRINGBUF_LENGTH (buf);
 936       char *cbuf;
 937       SCM sbc = scm_i_make_string (len, &cbuf, 0);
 938       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
 939       e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 940                      sbc);
 941     }
 942   else
 943     {
 944       size_t len = STRINGBUF_LENGTH (buf);
 945       scm_t_wchar *cbuf;
 946       SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
 947       u32_cpy ((scm_t_uint32 *) cbuf,
 948                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
 949       e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 950                      sbc);
 951     }
 952   e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
 953                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
 954   if (STRINGBUF_SHARED (buf))
 955     e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 956                    SCM_BOOL_T);
 957   else
 958     e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 959                    SCM_BOOL_F);
 960   if (STRINGBUF_WIDE (buf))
 961     e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 962                    SCM_BOOL_T);
 963   else
 964     e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 965                    SCM_BOOL_F);
 966
 967   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
 968 }
 969 #undef FUNC_NAME
 970
 971 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
 972             "Returns an association list containing debugging information\n"
 973             "for @var{sym}. The association list has the following entries."
 974             "@table @code\n"
 975             "@item symbol\n"
 976             "The symbol itself\n"
 977             "@item hash\n"
 978             "Its hash value\n"
 979             "@item interned\n"
 980             "@code{#t} if it is an interned symbol\n"
 981             "@item stringbuf-chars\n"
 982             "A new string containing this symbols's stringbuf's characters\n"
 983             "@item stringbuf-length\n"
 984             "The number of characters in this stringbuf\n"
 985             "@item stringbuf-shared\n"
 986             "@code{#t} if this stringbuf is shared\n"
 987             "@item stringbuf-wide\n"
 988             "@code{#t} if this stringbuf's characters are stored in a\n"
 989             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
 990             "buffer\n"
 991             "@end table")
 992 #define FUNC_NAME s_scm_sys_symbol_dump
 993 {
 994   SCM e1, e2, e3, e4, e5, e6, e7;
 995   SCM buf;
 996   SCM_VALIDATE_SYMBOL (1, sym);
 997   e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
 998                  sym);
 999   e2 = scm_cons (scm_from_latin1_symbol ("hash"),
1000                  scm_from_ulong (scm_i_symbol_hash (sym)));
1001   e3 = scm_cons (scm_from_latin1_symbol ("interned"),
1002                  scm_symbol_interned_p (sym));
1003   buf = SYMBOL_STRINGBUF (sym);
1004
1005   /* Stringbuf info */
1006   if (!STRINGBUF_WIDE (buf))
1007     {
1008       size_t len = STRINGBUF_LENGTH (buf);
1009       char *cbuf;
1010       SCM sbc = scm_i_make_string (len, &cbuf, 0);
1011       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
1012       e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1013                      sbc);
1014     }
1015   else
1016     {
1017       size_t len = STRINGBUF_LENGTH (buf);
1018       scm_t_wchar *cbuf;
1019       SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
1020       u32_cpy ((scm_t_uint32 *) cbuf,
1021                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
1022       e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1023                      sbc);
1024     }
1025   e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
1026                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
1027   if (STRINGBUF_SHARED (buf))
1028     e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1029                    SCM_BOOL_T);
1030   else
1031     e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1032                    SCM_BOOL_F);
1033   if (STRINGBUF_WIDE (buf))
1034     e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1035                     SCM_BOOL_T);
1036   else
1037     e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1038                     SCM_BOOL_F);
1039   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
1040
1041 }
1042 #undef FUNC_NAME
1043
1044 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1045
1046 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
1047 #define FUNC_NAME s_scm_sys_stringbuf_hist
1048 {
1049   int i;
1050   for (i = 0; i < 1000; i++)
1051     if (lenhist[i])
1052       fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1053   fprintf (stderr, ">999: %u\n", lenhist[1000]);
1054   return SCM_UNSPECIFIED;
1055 }
1056 #undef FUNC_NAME
1057
1058 #endif
1059
1060 \f
1061
1062 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1063             (SCM obj),
1064             "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1065 #define FUNC_NAME s_scm_string_p
1066 {
1067   return scm_from_bool (IS_STRING (obj));
1068 }
1069 #undef FUNC_NAME
1070
1071
1072 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1073
1074 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1075             (SCM chrs),
1076             "@deffnx {Scheme Procedure} list->string chrs\n"
1077             "Return a newly allocated string composed of the arguments,\n"
1078             "@var{chrs}.")
1079 #define FUNC_NAME s_scm_string
1080 {
1081   SCM result = SCM_BOOL_F;
1082   SCM rest;
1083   size_t len;
1084   size_t p = 0;
1085   long i;
1086   int wide = 0;
1087
1088   /* Verify that this is a list of chars.  */
1089   i = scm_ilength (chrs);
1090   SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1091
1092   len = (size_t) i;
1093   rest = chrs;
1094
1095   while (len > 0 && scm_is_pair (rest))
1096     {
1097       SCM elt = SCM_CAR (rest);
1098       SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1099       if (SCM_CHAR (elt) > 0xFF)
1100         wide = 1;
1101       rest = SCM_CDR (rest);
1102       len--;
1103       scm_remember_upto_here_1 (elt);
1104     }
1105
1106   /* Construct a string containing this list of chars.  */
1107   len = (size_t) i;
1108   rest = chrs;
1109
1110   if (wide == 0)
1111     {
1112       char *buf;
1113
1114       result = scm_i_make_string (len, NULL, 0);
1115       result = scm_i_string_start_writing (result);
1116       buf = scm_i_string_writable_chars (result);
1117       while (len > 0 && scm_is_pair (rest))
1118         {
1119           SCM elt = SCM_CAR (rest);
1120           buf[p] = (unsigned char) SCM_CHAR (elt);
1121           p++;
1122           rest = SCM_CDR (rest);
1123           len--;
1124           scm_remember_upto_here_1 (elt);
1125         }
1126     }
1127   else
1128     {
1129       scm_t_wchar *buf;
1130
1131       result = scm_i_make_wide_string (len, NULL, 0);
1132       result = scm_i_string_start_writing (result);
1133       buf = scm_i_string_writable_wide_chars (result);
1134       while (len > 0 && scm_is_pair (rest))
1135         {
1136           SCM elt = SCM_CAR (rest);
1137           buf[p] = SCM_CHAR (elt);
1138           p++;
1139           rest = SCM_CDR (rest);
1140           len--;
1141           scm_remember_upto_here_1 (elt);
1142         }
1143     }
1144   scm_i_string_stop_writing ();
1145
1146   if (len > 0)
1147     scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1148   if (!scm_is_null (rest))
1149     scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1150
1151   return result;
1152 }
1153 #undef FUNC_NAME
1154
1155 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1156             (SCM k, SCM chr),
1157             "Return a newly allocated string of\n"
1158             "length @var{k}.  If @var{chr} is given, then all elements of\n"
1159             "the string are initialized to @var{chr}, otherwise the contents\n"
1160             "of the @var{string} are all set to @var{#\nul}.")
1161 #define FUNC_NAME s_scm_make_string
1162 {
1163   return scm_c_make_string (scm_to_size_t (k), chr);
1164 }
1165 #undef FUNC_NAME
1166
1167 SCM
1168 scm_c_make_string (size_t len, SCM chr)
1169 #define FUNC_NAME NULL
1170 {
1171   size_t p;
1172   char *contents = NULL;
1173   SCM res = scm_i_make_string (len, &contents, 0);
1174
1175   /* If no char is given, initialize string contents to NULL.  */
1176   if (SCM_UNBNDP (chr))
1177     memset (contents, 0, len);
1178   else
1179     {
1180       SCM_VALIDATE_CHAR (0, chr);
1181       res = scm_i_string_start_writing (res);
1182       for (p = 0; p < len; p++)
1183         scm_i_string_set_x (res, p, SCM_CHAR (chr));
1184       scm_i_string_stop_writing ();
1185     }
1186
1187   return res;
1188 }
1189 #undef FUNC_NAME
1190
1191 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1192             (SCM string),
1193             "Return the number of characters in @var{string}.")
1194 #define FUNC_NAME s_scm_string_length
1195 {
1196   SCM_VALIDATE_STRING (1, string);
1197   return scm_from_size_t (STRING_LENGTH (string));
1198 }
1199 #undef FUNC_NAME
1200
1201 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1202             (SCM string),
1203             "Return the bytes used to represent a character in @var{string}."
1204             "This will return 1 or 4.")
1205 #define FUNC_NAME s_scm_string_bytes_per_char
1206 {
1207   SCM_VALIDATE_STRING (1, string);
1208   if (!scm_i_is_narrow_string (string))
1209     return scm_from_int (4);
1210
1211   return scm_from_int (1);
1212 }
1213 #undef FUNC_NAME
1214
1215 size_t
1216 scm_c_string_length (SCM string)
1217 {
1218   if (!IS_STRING (string))
1219     scm_wrong_type_arg_msg (NULL, 0, string, "string");
1220   return STRING_LENGTH (string);
1221 }
1222
1223 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1224             (SCM str, SCM k),
1225             "Return character @var{k} of @var{str} using zero-origin\n"
1226             "indexing. @var{k} must be a valid index of @var{str}.")
1227 #define FUNC_NAME s_scm_string_ref
1228 {
1229   size_t len;
1230   unsigned long idx;
1231
1232   SCM_VALIDATE_STRING (1, str);
1233
1234   len = scm_i_string_length (str);
1235   if (SCM_LIKELY (len > 0))
1236     idx = scm_to_unsigned_integer (k, 0, len - 1);
1237   else
1238     scm_out_of_range (NULL, k);
1239
1240   if (scm_i_is_narrow_string (str))
1241     return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1242   else
1243     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1244 }
1245 #undef FUNC_NAME
1246
1247 SCM
1248 scm_c_string_ref (SCM str, size_t p)
1249 {
1250   if (p >= scm_i_string_length (str))
1251     scm_out_of_range (NULL, scm_from_size_t (p));
1252   if (scm_i_is_narrow_string (str))
1253     return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1254   else
1255     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1256
1257 }
1258
1259 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1260             (SCM str, SCM k, SCM chr),
1261             "Store @var{chr} in element @var{k} of @var{str} and return\n"
1262             "an unspecified value. @var{k} must be a valid index of\n"
1263             "@var{str}.")
1264 #define FUNC_NAME s_scm_string_set_x
1265 {
1266   size_t len;
1267   unsigned long idx;
1268
1269   SCM_VALIDATE_STRING (1, str);
1270
1271   len = scm_i_string_length (str);
1272   if (SCM_LIKELY (len > 0))
1273     idx = scm_to_unsigned_integer (k, 0, len - 1);
1274   else
1275     scm_out_of_range (NULL, k);
1276
1277   SCM_VALIDATE_CHAR (3, chr);
1278   str = scm_i_string_start_writing (str);
1279   scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1280   scm_i_string_stop_writing ();
1281
1282   return SCM_UNSPECIFIED;
1283 }
1284 #undef FUNC_NAME
1285
1286 void
1287 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1288 {
1289   if (p >= scm_i_string_length (str))
1290     scm_out_of_range (NULL, scm_from_size_t (p));
1291   str = scm_i_string_start_writing (str);
1292   scm_i_string_set_x (str, p, SCM_CHAR (chr));
1293   scm_i_string_stop_writing ();
1294 }
1295
1296 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1297             (SCM str, SCM start, SCM end),
1298             "Return a newly allocated string formed from the characters\n"
1299             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1300             "ending with index @var{end} (exclusive).\n"
1301             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1302             "exact integers satisfying:\n\n"
1303             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1304 #define FUNC_NAME s_scm_substring
1305 {
1306   size_t len, from, to;
1307
1308   SCM_VALIDATE_STRING (1, str);
1309   len = scm_i_string_length (str);
1310   from = scm_to_unsigned_integer (start, 0, len);
1311   if (SCM_UNBNDP (end))
1312     to = len;
1313   else
1314     to = scm_to_unsigned_integer (end, from, len);
1315   return scm_i_substring (str, from, to);
1316 }
1317 #undef FUNC_NAME
1318
1319 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1320             (SCM str, SCM start, SCM end),
1321             "Return a newly allocated string formed from the characters\n"
1322             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1323             "ending with index @var{end} (exclusive).\n"
1324             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1325             "exact integers satisfying:\n"
1326             "\n"
1327             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1328             "\n"
1329             "The returned string is read-only.\n")
1330 #define FUNC_NAME s_scm_substring_read_only
1331 {
1332   size_t len, from, to;
1333
1334   SCM_VALIDATE_STRING (1, str);
1335   len = scm_i_string_length (str);
1336   from = scm_to_unsigned_integer (start, 0, len);
1337   if (SCM_UNBNDP (end))
1338     to = len;
1339   else
1340     to = scm_to_unsigned_integer (end, from, len);
1341   return scm_i_substring_read_only (str, from, to);
1342 }
1343 #undef FUNC_NAME
1344
1345 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1346             (SCM str, SCM start, SCM end),
1347             "Return a newly allocated string formed from the characters\n"
1348             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1349             "ending with index @var{end} (exclusive).\n"
1350             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1351             "exact integers satisfying:\n\n"
1352             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1353 #define FUNC_NAME s_scm_substring_copy
1354 {
1355   /* For the Scheme version, START is mandatory, but for the C
1356      version, it is optional.  See scm_string_copy in srfi-13.c for a
1357      rationale.
1358   */
1359
1360   size_t from, to;
1361
1362   SCM_VALIDATE_STRING (1, str);
1363   scm_i_get_substring_spec (scm_i_string_length (str),
1364                             start, &from, end, &to);
1365   return scm_i_substring_copy (str, from, to);
1366 }
1367 #undef FUNC_NAME
1368
1369 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1370             (SCM str, SCM start, SCM end),
1371             "Return string that indirectly refers to the characters\n"
1372             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1373             "ending with index @var{end} (exclusive).\n"
1374             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1375             "exact integers satisfying:\n\n"
1376             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1377 #define FUNC_NAME s_scm_substring_shared
1378 {
1379   size_t len, from, to;
1380
1381   SCM_VALIDATE_STRING (1, str);
1382   len = scm_i_string_length (str);
1383   from = scm_to_unsigned_integer (start, 0, len);
1384   if (SCM_UNBNDP (end))
1385     to = len;
1386   else
1387     to = scm_to_unsigned_integer (end, from, len);
1388   return scm_i_substring_shared (str, from, to);
1389 }
1390 #undef FUNC_NAME
1391
1392 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1393             (SCM args),
1394             "Return a newly allocated string whose characters form the\n"
1395             "concatenation of the given strings, @var{args}.")
1396 #define FUNC_NAME s_scm_string_append
1397 {
1398   SCM res;
1399   size_t len = 0;
1400   int wide = 0;
1401   SCM l, s;
1402   size_t i;
1403   union
1404   {
1405     char *narrow;
1406     scm_t_wchar *wide;
1407   } data;
1408
1409   SCM_VALIDATE_REST_ARGUMENT (args);
1410   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1411     {
1412       s = SCM_CAR (l);
1413       SCM_VALIDATE_STRING (SCM_ARGn, s);
1414       len += scm_i_string_length (s);
1415       if (!scm_i_is_narrow_string (s))
1416         wide = 1;
1417     }
1418   data.narrow = NULL;
1419   if (!wide)
1420     res = scm_i_make_string (len, &data.narrow, 0);
1421   else
1422     res = scm_i_make_wide_string (len, &data.wide, 0);
1423
1424   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1425     {
1426       size_t len;
1427       s = SCM_CAR (l);
1428       SCM_VALIDATE_STRING (SCM_ARGn, s);
1429       len = scm_i_string_length (s);
1430       if (!wide)
1431         {
1432           memcpy (data.narrow, scm_i_string_chars (s), len);
1433           data.narrow += len;
1434         }
1435       else
1436         {
1437           if (scm_i_is_narrow_string (s))
1438             {
1439               for (i = 0; i < scm_i_string_length (s); i++)
1440                 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
1441             }
1442           else
1443             u32_cpy ((scm_t_uint32 *) data.wide,
1444                      (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1445           data.wide += len;
1446         }
1447       scm_remember_upto_here_1 (s);
1448     }
1449   return res;
1450 }
1451 #undef FUNC_NAME
1452
1453
1454 \f
1455 /* Charset conversion error handling.  */
1456
1457 SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1458 SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1459
1460 /* Raise an exception informing that character CHR could not be written
1461    to PORT in its current encoding.  */
1462 void
1463 scm_encoding_error (const char *subr, int err, const char *message,
1464                     SCM port, SCM chr)
1465 {
1466   scm_throw (scm_encoding_error_key,
1467              scm_list_n (scm_from_latin1_string (subr),
1468                          scm_from_latin1_string (message),
1469                          scm_from_int (err),
1470                          port, chr,
1471                          SCM_UNDEFINED));
1472 }
1473
1474 /* Raise an exception informing of an encoding error on PORT.  This
1475    means that a character could not be written in PORT's encoding.  */
1476 void
1477 scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1478 {
1479   scm_throw (scm_decoding_error_key,
1480              scm_list_n (scm_from_latin1_string (subr),
1481                          scm_from_latin1_string (message),
1482                          scm_from_int (err),
1483                          port,
1484                          SCM_UNDEFINED));
1485 }
1486
1487 \f
1488 /* String conversion to/from C.  */
1489
1490 static void
1491 decoding_error (const char *func_name, int errno_save,
1492                 const char *str, size_t len)
1493 {
1494   /* Raise an error and pass the raw C string as a bytevector to the `throw'
1495      handler.  */
1496   SCM bv;
1497   signed char *buf;
1498
1499   buf = scm_gc_malloc_pointerless (len, "bytevector");
1500   memcpy (buf, str, len);
1501   bv = scm_c_take_gc_bytevector (buf, len, SCM_BOOL_F);
1502
1503   scm_decoding_error (func_name, errno_save,
1504                       "input locale conversion error", bv);
1505 }
1506
1507 SCM
1508 scm_from_stringn (const char *str, size_t len, const char *encoding,
1509                   scm_t_string_failed_conversion_handler handler)
1510 {
1511   size_t u32len, i;
1512   scm_t_wchar *u32;
1513   int wide = 0;
1514   SCM res;
1515
1516   /* The order of these checks is important. */
1517   if (!str && len != 0)
1518     scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1519   if (len == (size_t) -1)
1520     len = strlen (str);
1521
1522   if (encoding == NULL || len == 0)
1523     {
1524       /* If encoding is null (or the string is empty), use Latin-1.  */
1525       char *buf;
1526       res = scm_i_make_string (len, &buf, 0);
1527       memcpy (buf, str, len);
1528       return res;
1529     }
1530
1531   u32len = 0;
1532   u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1533                                                 (enum iconv_ilseq_handler)
1534                                                 handler,
1535                                                 str, len,
1536                                                 NULL,
1537                                                 NULL, &u32len);
1538
1539   if (SCM_UNLIKELY (u32 == NULL))
1540     decoding_error (__func__, errno, str, len);
1541
1542   i = 0;
1543   while (i < u32len)
1544     if (u32[i++] > 0xFF)
1545       {
1546         wide = 1;
1547         break;
1548       }
1549
1550   if (!wide)
1551     {
1552       char *dst;
1553       res = scm_i_make_string (u32len, &dst, 0);
1554       for (i = 0; i < u32len; i ++)
1555         dst[i] = (unsigned char) u32[i];
1556       dst[u32len] = '\0';
1557     }
1558   else
1559     {
1560       scm_t_wchar *wdst;
1561       res = scm_i_make_wide_string (u32len, &wdst, 0);
1562       u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1563       wdst[u32len] = 0;
1564     }
1565
1566   free (u32);
1567   return res;
1568 }
1569
1570 SCM
1571 scm_from_locale_string (const char *str)
1572 {
1573   return scm_from_locale_stringn (str, -1);
1574 }
1575
1576 SCM
1577 scm_from_locale_stringn (const char *str, size_t len)
1578 {
1579   return scm_from_stringn (str, len, locale_charset (),
1580                            scm_i_get_conversion_strategy (SCM_BOOL_F));
1581 }
1582
1583 SCM
1584 scm_from_latin1_string (const char *str)
1585 {
1586   return scm_from_latin1_stringn (str, -1);
1587 }
1588
1589 SCM
1590 scm_from_latin1_stringn (const char *str, size_t len)
1591 {
1592   char *buf;
1593   SCM result;
1594
1595   if (len == (size_t) -1)
1596     len = strlen (str);
1597
1598   /* Make a narrow string and copy STR as is.  */
1599   result = scm_i_make_string (len, &buf, 0);
1600   memcpy (buf, str, len);
1601
1602   return result;
1603 }
1604
1605 SCM
1606 scm_from_utf8_string (const char *str)
1607 {
1608   return scm_from_utf8_stringn (str, -1);
1609 }
1610
1611 SCM
1612 scm_from_utf8_stringn (const char *str, size_t len)
1613 {
1614   size_t i, char_len;
1615   const scm_t_uint8 *ustr = (const scm_t_uint8 *) str;
1616   int ascii = 1, narrow = 1;
1617   SCM res;
1618
1619   if (len == (size_t) -1)
1620     len = strlen (str);
1621
1622   i = 0;
1623   char_len = 0;
1624
1625   while (i < len)
1626     {
1627       if (ustr[i] <= 127)
1628         {
1629           char_len++;
1630           i++;
1631         }
1632       else
1633         {
1634           ucs4_t c;
1635           int nbytes;
1636
1637           ascii = 0;
1638
1639           nbytes = u8_mbtouc (&c, ustr + i, len - i);
1640
1641           if (nbytes < 0)
1642             /* Bad UTF-8.  */
1643             decoding_error (__func__, errno, str, len);
1644
1645           if (c > 255)
1646             narrow = 0;
1647
1648           char_len++;
1649           i += nbytes;
1650         }
1651     }
1652
1653   if (ascii)
1654     {
1655       char *dst;
1656       res = scm_i_make_string (char_len, &dst, 0);
1657       memcpy (dst, str, len);
1658     }
1659   else if (narrow)
1660     {
1661       char *dst;
1662       size_t j;
1663       ucs4_t c;
1664
1665       res = scm_i_make_string (char_len, &dst, 0);
1666
1667       for (i = 0, j = 0; i < len; i++, j++)
1668         {
1669           i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1670           dst[j] = (signed char) c;
1671         }
1672     }
1673   else
1674     {
1675       scm_t_wchar *dst;
1676       size_t j;
1677       ucs4_t c;
1678
1679       res = scm_i_make_wide_string (char_len, &dst, 0);
1680
1681       for (i = 0, j = 0; i < len; i++, j++)
1682         {
1683           i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1684           dst[j] = c;
1685         }
1686     }
1687
1688   return res;
1689 }
1690
1691 SCM
1692 scm_from_utf32_string (const scm_t_wchar *str)
1693 {
1694   return scm_from_utf32_stringn (str, -1);
1695 }
1696
1697 SCM
1698 scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1699 {
1700   SCM result;
1701   scm_t_wchar *buf;
1702
1703   if (len == (size_t) -1)
1704     len = u32_strlen ((uint32_t *) str);
1705
1706   result = scm_i_make_wide_string (len, &buf, 0);
1707   memcpy (buf, str, len * sizeof (scm_t_wchar));
1708   scm_i_try_narrow_string (result);
1709
1710   return result;
1711 }
1712
1713 /* Create a new scheme string from the C string STR.  The memory of
1714    STR may be used directly as storage for the new string.  */
1715 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1716    would be to register a finalizer to eventually free(3) STR, which isn't
1717    worth it.  Should we just deprecate the `scm_take_' functions?  */
1718 SCM
1719 scm_take_locale_stringn (char *str, size_t len)
1720 {
1721   SCM res;
1722
1723   res = scm_from_locale_stringn (str, len);
1724   free (str);
1725
1726   return res;
1727 }
1728
1729 SCM
1730 scm_take_locale_string (char *str)
1731 {
1732   return scm_take_locale_stringn (str, -1);
1733 }
1734
1735 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1736    *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1737    Set *LENP to the size of the resulting string.
1738
1739    FIXME: This is a hack we should get rid of.  See
1740    <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1741    for details.  */
1742 static void
1743 unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
1744 {
1745   char *before, *after;
1746   size_t i, j;
1747
1748   before = buf;
1749   after = buf;
1750   i = 0;
1751   j = 0;
1752   while (i < *lenp)
1753     {
1754       if ((i <= *lenp - 6)
1755           && before[i] == '\\'
1756           && before[i + 1] == 'u'
1757           && before[i + 2] == '0' && before[i + 3] == '0')
1758         {
1759           /* Convert \u00NN to \xNN */
1760           after[j] = '\\';
1761           after[j + 1] = 'x';
1762           after[j + 2] = tolower ((int) before[i + 4]);
1763           after[j + 3] = tolower ((int) before[i + 5]);
1764           i += 6;
1765           j += 4;
1766         }
1767       else if ((i <= *lenp - 10)
1768                && before[i] == '\\'
1769                && before[i + 1] == 'U'
1770                && before[i + 2] == '0' && before[i + 3] == '0')
1771         {
1772           /* Convert \U00NNNNNN to \UNNNNNN */
1773           after[j] = '\\';
1774           after[j + 1] = 'U';
1775           after[j + 2] = tolower ((int) before[i + 4]);
1776           after[j + 3] = tolower ((int) before[i + 5]);
1777           after[j + 4] = tolower ((int) before[i + 6]);
1778           after[j + 5] = tolower ((int) before[i + 7]);
1779           after[j + 6] = tolower ((int) before[i + 8]);
1780           after[j + 7] = tolower ((int) before[i + 9]);
1781           i += 10;
1782           j += 8;
1783         }
1784       else
1785         {
1786           after[j] = before[i];
1787           i++;
1788           j++;
1789         }
1790     }
1791   *lenp = j;
1792 }
1793
1794 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1795    *LENP-byte locale-encoded string, to `\xXXXX;'.  Set *LEN to the size
1796    of the resulting string.  BUF must be large enough to handle the
1797    worst case when `\uXXXX' escapes (6 characters) are replaced by
1798    `\xXXXX;' (7 characters).  */
1799 static void
1800 unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
1801 {
1802   char *before, *after;
1803   size_t i, j;
1804   /* The worst case is if the input string contains all 4-digit hex escapes.
1805      "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1806   size_t max_out_len = (*lenp * 7) / 6 + 1;
1807   size_t nzeros, ndigits;
1808
1809   before = buf;
1810   after = alloca (max_out_len);
1811   i = 0;
1812   j = 0;
1813   while (i < *lenp)
1814     {
1815       if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1816           || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1817         {
1818           if (before[i + 1] == 'u')
1819             ndigits = 4;
1820           else if (before[i + 1] == 'U')
1821             ndigits = 8;
1822           else
1823             abort ();
1824
1825           /* Add the R6RS hex escape initial sequence.  */
1826           after[j] = '\\';
1827           after[j + 1] = 'x';
1828
1829           /* Move string positions to the start of the hex numbers.  */
1830           i += 2;
1831           j += 2;
1832
1833           /* Find the number of initial zeros in this hex number.  */
1834           nzeros = 0;
1835           while (before[i + nzeros] == '0' && nzeros < ndigits)
1836             nzeros++;
1837
1838           /* Copy the number, skipping initial zeros, and then move the string
1839              positions.  */
1840           if (nzeros == ndigits)
1841             {
1842               after[j] = '0';
1843               i += ndigits;
1844               j += 1;
1845             }
1846           else
1847             {
1848               int pos;
1849               for (pos = 0; pos < ndigits - nzeros; pos++)
1850                 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1851               i += ndigits;
1852               j += (ndigits - nzeros);
1853             }
1854
1855           /* Add terminating semicolon.  */
1856           after[j] = ';';
1857           j++;
1858         }
1859       else
1860         {
1861           after[j] = before[i];
1862           i++;
1863           j++;
1864         }
1865     }
1866   *lenp = j;
1867   memcpy (before, after, j);
1868 }
1869
1870 char *
1871 scm_to_locale_string (SCM str)
1872 {
1873   return scm_to_locale_stringn (str, NULL);
1874 }
1875
1876 char *
1877 scm_to_locale_stringn (SCM str, size_t *lenp)
1878 {
1879   return scm_to_stringn (str, lenp,
1880                          locale_charset (),
1881                          scm_i_get_conversion_strategy (SCM_BOOL_F));
1882 }
1883
1884 char *
1885 scm_to_latin1_string (SCM str)
1886 {
1887   return scm_to_latin1_stringn (str, NULL);
1888 }
1889
1890 char *
1891 scm_to_latin1_stringn (SCM str, size_t *lenp)
1892 #define FUNC_NAME "scm_to_latin1_stringn"
1893 {
1894   char *result;
1895
1896   SCM_VALIDATE_STRING (1, str);
1897
1898   if (scm_i_is_narrow_string (str))
1899     {
1900       size_t len = scm_i_string_length (str);
1901
1902       if (lenp)
1903         *lenp = len;
1904
1905       result = scm_strndup (scm_i_string_data (str), len);
1906     }
1907   else
1908     result = scm_to_stringn (str, lenp, NULL,
1909                              SCM_FAILED_CONVERSION_ERROR);
1910
1911   return result;
1912 }
1913 #undef FUNC_NAME
1914
1915 char *
1916 scm_to_utf8_string (SCM str)
1917 {
1918   return scm_to_utf8_stringn (str, NULL);
1919 }
1920
1921 char *
1922 scm_to_utf8_stringn (SCM str, size_t *lenp)
1923 {
1924   return scm_to_stringn (str, lenp, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1925 }
1926
1927 scm_t_wchar *
1928 scm_to_utf32_string (SCM str)
1929 {
1930   return scm_to_utf32_stringn (str, NULL);
1931 }
1932
1933 scm_t_wchar *
1934 scm_to_utf32_stringn (SCM str, size_t *lenp)
1935 #define FUNC_NAME "scm_to_utf32_stringn"
1936 {
1937   scm_t_wchar *result;
1938
1939   SCM_VALIDATE_STRING (1, str);
1940
1941   if (scm_i_is_narrow_string (str))
1942     result = (scm_t_wchar *)
1943       scm_to_stringn (str, lenp, "UTF-32",
1944                       SCM_FAILED_CONVERSION_ERROR);
1945   else
1946     {
1947       size_t len;
1948
1949       len = scm_i_string_length (str);
1950       if (lenp)
1951         *lenp = len;
1952
1953       result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
1954       memcpy (result, scm_i_string_wide_chars (str),
1955               len * sizeof (scm_t_wchar));
1956       result[len] = 0;
1957     }
1958
1959   return result;
1960 }
1961 #undef FUNC_NAME
1962
1963 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
1964    according to ENCODING.  If LENP is non-NULL, set it to the size in bytes of
1965    the returned buffer.  If the conversion to ENCODING fails, apply the strategy
1966    defined by HANDLER.  */
1967 char *
1968 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
1969                 scm_t_string_failed_conversion_handler handler)
1970 {
1971   char *buf;
1972   size_t ilen, len, i;
1973   int ret;
1974   const char *enc;
1975
1976   if (!scm_is_string (str))
1977     scm_wrong_type_arg_msg (NULL, 0, str, "string");
1978   ilen = scm_i_string_length (str);
1979
1980   if (ilen == 0)
1981     {
1982       buf = scm_malloc (1);
1983       buf[0] = '\0';
1984       if (lenp)
1985         *lenp = 0;
1986       return buf;
1987     }
1988
1989   if (lenp == NULL)
1990     for (i = 0; i < ilen; i++)
1991       if (scm_i_string_ref (str, i) == '\0')
1992         scm_misc_error (NULL,
1993                         "string contains #\\nul character: ~S",
1994                         scm_list_1 (str));
1995
1996   if (scm_i_is_narrow_string (str) && (encoding == NULL))
1997     {
1998       /* If using native Latin-1 encoding, just copy the string
1999          contents.  */
2000       if (lenp)
2001         {
2002           buf = scm_malloc (ilen);
2003           memcpy (buf, scm_i_string_chars (str), ilen);
2004           *lenp = ilen;
2005           return buf;
2006         }
2007       else
2008         {
2009           buf = scm_malloc (ilen + 1);
2010           memcpy (buf, scm_i_string_chars (str), ilen);
2011           buf[ilen] = '\0';
2012           return buf;
2013         }
2014     }
2015
2016
2017   buf = NULL;
2018   len = 0;
2019   enc = encoding;
2020   if (enc == NULL)
2021     enc = "ISO-8859-1";
2022   if (scm_i_is_narrow_string (str))
2023     {
2024       ret = mem_iconveh (scm_i_string_chars (str), ilen,
2025                          "ISO-8859-1", enc,
2026                          (enum iconv_ilseq_handler) handler, NULL,
2027                          &buf, &len);
2028
2029       if (ret != 0)
2030         scm_encoding_error (__func__, errno,
2031                             "cannot convert narrow string to output locale",
2032                             SCM_BOOL_F,
2033                             /* FIXME: Faulty character unknown.  */
2034                             SCM_BOOL_F);
2035     }
2036   else
2037     {
2038       buf = u32_conv_to_encoding (enc,
2039                                   (enum iconv_ilseq_handler) handler,
2040                                   (scm_t_uint32 *) scm_i_string_wide_chars (str),
2041                                   ilen,
2042                                   NULL,
2043                                   NULL, &len);
2044       if (buf == NULL)
2045         scm_encoding_error (__func__, errno,
2046                             "cannot convert wide string to output locale",
2047                             SCM_BOOL_F,
2048                             /* FIXME: Faulty character unknown.  */
2049                             SCM_BOOL_F);
2050     }
2051   if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
2052     {
2053       if (SCM_R6RS_ESCAPES_P)
2054         {
2055           /* The worst case is if the input string contains all 4-digit
2056              hex escapes.  "\uXXXX" (six characters) becomes "\xXXXX;"
2057              (seven characters).  Make BUF large enough to hold
2058              that.  */
2059           buf = scm_realloc (buf, (len * 7) / 6 + 1);
2060           unistring_escapes_to_r6rs_escapes (buf, &len);
2061         }
2062       else
2063         unistring_escapes_to_guile_escapes (buf, &len);
2064
2065       buf = scm_realloc (buf, len);
2066     }
2067   if (lenp)
2068     *lenp = len;
2069   else
2070     {
2071       buf = scm_realloc (buf, len + 1);
2072       buf[len] = '\0';
2073     }
2074
2075   scm_remember_upto_here_1 (str);
2076   return buf;
2077 }
2078
2079 size_t
2080 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
2081 {
2082   size_t len;
2083   char *result = NULL;
2084   if (!scm_is_string (str))
2085     scm_wrong_type_arg_msg (NULL, 0, str, "string");
2086   result = scm_to_locale_stringn (str, &len);
2087
2088   memcpy (buf, result, (len > max_len) ? max_len : len);
2089   free (result);
2090
2091   scm_remember_upto_here_1 (str);
2092   return len;
2093 }
2094
2095 \f
2096 /* Unicode string normalization.  */
2097
2098 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2099    libguile/i18n.c.  It would be useful to have this factored out into a more
2100    convenient location, but its use of alloca makes that tricky to do. */
2101
2102 static SCM
2103 normalize_str (SCM string, uninorm_t form)
2104 {
2105   SCM ret;
2106   scm_t_uint32 *w_str;
2107   scm_t_wchar *cbuf;
2108   size_t rlen, len = scm_i_string_length (string);
2109
2110   if (scm_i_is_narrow_string (string))
2111     {
2112       size_t i;
2113       const char *buf = scm_i_string_chars (string);
2114
2115       w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2116
2117       for (i = 0; i < len; i ++)
2118         w_str[i] = (unsigned char) buf[i];
2119       w_str[len] = 0;
2120     }
2121   else
2122     w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2123
2124   w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2125
2126   ret = scm_i_make_wide_string (rlen, &cbuf, 0);
2127   u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2128   free (w_str);
2129
2130   scm_i_try_narrow_string (ret);
2131
2132   return ret;
2133 }
2134
2135 SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2136             (SCM string),
2137             "Returns the NFC normalized form of @var{string}.")
2138 #define FUNC_NAME s_scm_string_normalize_nfc
2139 {
2140   SCM_VALIDATE_STRING (1, string);
2141   return normalize_str (string, UNINORM_NFC);
2142 }
2143 #undef FUNC_NAME
2144
2145 SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2146             (SCM string),
2147             "Returns the NFD normalized form of @var{string}.")
2148 #define FUNC_NAME s_scm_string_normalize_nfd
2149 {
2150   SCM_VALIDATE_STRING (1, string);
2151   return normalize_str (string, UNINORM_NFD);
2152 }
2153 #undef FUNC_NAME
2154
2155 SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2156             (SCM string),
2157             "Returns the NFKC normalized form of @var{string}.")
2158 #define FUNC_NAME s_scm_string_normalize_nfkc
2159 {
2160   SCM_VALIDATE_STRING (1, string);
2161   return normalize_str (string, UNINORM_NFKC);
2162 }
2163 #undef FUNC_NAME
2164
2165 SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2166             (SCM string),
2167             "Returns the NFKD normalized form of @var{string}.")
2168 #define FUNC_NAME s_scm_string_normalize_nfkd
2169 {
2170   SCM_VALIDATE_STRING (1, string);
2171   return normalize_str (string, UNINORM_NFKD);
2172 }
2173 #undef FUNC_NAME
2174
2175 /* converts C scm_array of strings to SCM scm_list of strings.
2176    If argc < 0, a null terminated scm_array is assumed.
2177    The current locale encoding is assumed */
2178 SCM
2179 scm_makfromstrs (int argc, char **argv)
2180 {
2181   int i = argc;
2182   SCM lst = SCM_EOL;
2183   if (0 > i)
2184     for (i = 0; argv[i]; i++);
2185   while (i--)
2186     lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2187   return lst;
2188 }
2189
2190 /* Return a newly allocated array of char pointers to each of the strings
2191    in args, with a terminating NULL pointer.  The strings are encoded using
2192    the current locale. */
2193
2194 char **
2195 scm_i_allocate_string_pointers (SCM list)
2196 #define FUNC_NAME "scm_i_allocate_string_pointers"
2197 {
2198   char **result;
2199   int list_len = scm_ilength (list);
2200   int i;
2201
2202   if (list_len < 0)
2203     scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2204
2205   result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2206                           "string pointers");
2207   result[list_len] = NULL;
2208
2209   /* The list might have been modified in another thread, so
2210      we check LIST before each access.
2211    */
2212   for (i = 0; i < list_len && scm_is_pair (list); i++)
2213     {
2214       SCM str = SCM_CAR (list);
2215       size_t len;  /* String length in bytes */
2216       char *c_str = scm_to_locale_stringn (str, &len);
2217
2218       /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2219          scm_malloc to allocate the returned string, which must be
2220          explicitly deallocated.  This forces us to copy the string a
2221          second time into a new buffer.  Ideally there would be variants
2222          of scm_to_*_stringn that can return garbage-collected buffers. */
2223
2224       result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2225       memcpy (result[i], c_str, len);
2226       result[i][len] = '\0';
2227       free (c_str);
2228
2229       list = SCM_CDR (list);
2230     }
2231
2232   return result;
2233 }
2234 #undef FUNC_NAME
2235
2236 void
2237 scm_i_get_substring_spec (size_t len,
2238                           SCM start, size_t *cstart,
2239                           SCM end, size_t *cend)
2240 {
2241   if (SCM_UNBNDP (start))
2242     *cstart = 0;
2243   else
2244     *cstart = scm_to_unsigned_integer (start, 0, len);
2245
2246   if (SCM_UNBNDP (end))
2247     *cend = len;
2248   else
2249     *cend = scm_to_unsigned_integer (end, *cstart, len);
2250 }
2251
2252 static SCM
2253 string_handle_ref (scm_t_array_handle *h, size_t index)
2254 {
2255   return scm_c_string_ref (h->array, index);
2256 }
2257
2258 static void
2259 string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2260 {
2261   scm_c_string_set_x (h->array, index, val);
2262 }
2263
2264 static void
2265 string_get_handle (SCM v, scm_t_array_handle *h)
2266 {
2267   h->array = v;
2268   h->ndims = 1;
2269   h->dims = &h->dim0;
2270   h->dim0.lbnd = 0;
2271   h->dim0.ubnd = scm_c_string_length (v) - 1;
2272   h->dim0.inc = 1;
2273   h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2274   h->elements = h->writable_elements = NULL;
2275 }
2276
2277 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2278                           string_handle_ref, string_handle_set,
2279                           string_get_handle)
2280 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2281
2282 void
2283 scm_init_strings ()
2284 {
2285   scm_nullstr = scm_i_make_string (0, NULL, 0);
2286
2287 #include "libguile/strings.x"
2288 }
2289
2290
2291 /*
2292   Local Variables:
2293   c-file-style: "gnu"
2294   End:
2295 */