libguile/strings.c

   1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   2  *
   3  * This library is free software; you can redistribute it and/or
   4  * modify it under the terms of the GNU Lesser General Public License
   5  * as published by the Free Software Foundation; either version 3 of
   6  * the License, or (at your option) any later version.
   7  *
   8  * This library is distributed in the hope that it will be useful, but
   9  * WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * Lesser General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU Lesser General Public
  14  * License along with this library; if not, write to the Free Software
  15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  16  * 02110-1301 USA
  17  */
  18
  19
  20 \f
  21 #ifdef HAVE_CONFIG_H
  22 # include <config.h>
  23 #endif
  24
  25 #include <alloca.h>
  26 #include <string.h>
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <uninorm.h>
  30 #include <unistr.h>
  31 #include <uniconv.h>
  32
  33 #include "striconveh.h"
  34
  35 #include "libguile/_scm.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/root.h"
  38 #include "libguile/strings.h"
  39 #include "libguile/error.h"
  40 #include "libguile/generalized-vectors.h"
  41 #include "libguile/deprecation.h"
  42 #include "libguile/validate.h"
  43 #include "libguile/private-options.h"
  44
  45 \f
  46
  47 /* {Strings}
  48  */
  49
  50
  51 /* Stringbufs
  52  *
  53  * XXX - keeping an accurate refcount during GC seems to be quite
  54  * tricky, so we just keep score of whether a stringbuf might be
  55  * shared, not whether it definitely is.
  56  *
  57  * The scheme I (mvo) tried to keep an accurate reference count would
  58  * recount all strings that point to a stringbuf during the mark-phase
  59  * of the GC.  This was done since one cannot access the stringbuf of
  60  * a string when that string is freed (in order to decrease the
  61  * reference count).  The memory of the stringbuf might have been
  62  * reused already for something completely different.
  63  *
  64  * This recounted worked for a small number of threads beating on
  65  * cow-strings, but it failed randomly with more than 10 threads, say.
  66  * I couldn't figure out what went wrong, so I used the conservative
  67  * approach implemented below.
  68  *
  69  * There are 2 storage strategies for stringbufs: 8-bit and wide.  8-bit
  70  * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
  71  * strings.
  72  */
  73
  74 /* The size in words of the stringbuf header (type tag + size).  */
  75 #define STRINGBUF_HEADER_SIZE   2U
  76
  77 #define STRINGBUF_HEADER_BYTES  (STRINGBUF_HEADER_SIZE * sizeof (SCM))
  78
  79 #define STRINGBUF_F_SHARED      SCM_I_STRINGBUF_F_SHARED
  80 #define STRINGBUF_F_WIDE        SCM_I_STRINGBUF_F_WIDE
  81
  82 #define STRINGBUF_TAG           scm_tc7_stringbuf
  83 #define STRINGBUF_SHARED(buf)   (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
  84 #define STRINGBUF_WIDE(buf)     (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
  85
  86 #define STRINGBUF_CONTENTS(buf) ((void *)                               \
  87                                  SCM_CELL_OBJECT_LOC (buf,              \
  88                                                       STRINGBUF_HEADER_SIZE))
  89 #define STRINGBUF_CHARS(buf)    ((unsigned char *) STRINGBUF_CONTENTS (buf))
  90 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
  91
  92 #define STRINGBUF_LENGTH(buf)   (SCM_CELL_WORD_1 (buf))
  93
  94 #define SET_STRINGBUF_SHARED(buf)                                       \
  95   do                                                                    \
  96     {                                                                   \
  97       /* Don't modify BUF if it's already marked as shared since it might be \
  98          a read-only, statically allocated stringbuf.  */               \
  99       if (SCM_LIKELY (!STRINGBUF_SHARED (buf)))                         \
 100         SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
 101     }                                                                   \
 102   while (0)
 103
 104 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 105 static size_t lenhist[1001];
 106 #endif
 107
 108 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
 109    characters. */
 110 static SCM
 111 make_stringbuf (size_t len)
 112 {
 113   /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
 114      scm_i_symbol_chars, all stringbufs are null-terminated.  Once
 115      SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
 116      has been changed for scm_i_symbol_chars, this null-termination
 117      can be dropped.
 118   */
 119
 120   SCM buf;
 121
 122 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 123   if (len < 1000)
 124     lenhist[len]++;
 125   else
 126     lenhist[1000]++;
 127 #endif
 128
 129   buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
 130                                             "string"));
 131
 132   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
 133   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 134
 135   STRINGBUF_CHARS (buf)[len] = 0;
 136
 137   return buf;
 138 }
 139
 140 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
 141    characters.  */
 142 static SCM
 143 make_wide_stringbuf (size_t len)
 144 {
 145   SCM buf;
 146   size_t raw_len;
 147
 148 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 149   if (len < 1000)
 150     lenhist[len]++;
 151   else
 152     lenhist[1000]++;
 153 #endif
 154
 155   raw_len = (len + 1) * sizeof (scm_t_wchar);
 156   buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
 157                                             "string"));
 158
 159   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
 160   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 161
 162   STRINGBUF_WIDE_CHARS (buf)[len] = 0;
 163
 164   return buf;
 165 }
 166
 167 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
 168    characters from BUF.  */
 169 static SCM
 170 wide_stringbuf (SCM buf)
 171 {
 172   SCM new_buf;
 173
 174   if (STRINGBUF_WIDE (buf))
 175     new_buf = buf;
 176   else
 177     {
 178       size_t i, len;
 179       scm_t_wchar *mem;
 180
 181       len = STRINGBUF_LENGTH (buf);
 182
 183       new_buf = make_wide_stringbuf (len);
 184
 185       mem = STRINGBUF_WIDE_CHARS (new_buf);
 186       for (i = 0; i < len; i++)
 187         mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
 188       mem[len] = 0;
 189     }
 190
 191   return new_buf;
 192 }
 193
 194 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
 195    characters from BUF, if possible.  */
 196 static SCM
 197 narrow_stringbuf (SCM buf)
 198 {
 199   SCM new_buf;
 200
 201   if (!STRINGBUF_WIDE (buf))
 202     new_buf = buf;
 203   else
 204     {
 205       size_t i, len;
 206       scm_t_wchar *wmem;
 207       unsigned char *mem;
 208
 209       len = STRINGBUF_LENGTH (buf);
 210       wmem = STRINGBUF_WIDE_CHARS (buf);
 211
 212       for (i = 0; i < len; i++)
 213         if (wmem[i] > 0xFF)
 214           /* BUF cannot be narrowed.  */
 215           return buf;
 216
 217       new_buf = make_stringbuf (len);
 218
 219       mem = STRINGBUF_CHARS (new_buf);
 220       for (i = 0; i < len; i++)
 221         mem[i] = (unsigned char) wmem[i];
 222       mem[len] = 0;
 223     }
 224
 225   return new_buf;
 226 }
 227
 228 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
 229
 230 \f
 231 /* Copy-on-write strings.
 232  */
 233
 234 #define STRING_TAG            scm_tc7_string
 235
 236 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
 237 #define STRING_START(str)     ((size_t)SCM_CELL_WORD_2(str))
 238 #define STRING_LENGTH(str)    ((size_t)SCM_CELL_WORD_3(str))
 239
 240 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
 241 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
 242
 243 #define IS_STRING(str)        (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
 244
 245 /* Read-only strings.
 246  */
 247
 248 #define RO_STRING_TAG         scm_tc7_ro_string
 249 #define IS_RO_STRING(str)     (SCM_CELL_TYPE(str)==RO_STRING_TAG)
 250
 251 /* Mutation-sharing substrings
 252  */
 253
 254 #define SH_STRING_TAG       (scm_tc7_string + 0x100)
 255
 256 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
 257 /* START and LENGTH as for STRINGs. */
 258
 259 #define IS_SH_STRING(str)   (SCM_CELL_TYPE(str)==SH_STRING_TAG)
 260
 261 SCM scm_nullstr;
 262
 263 static SCM null_stringbuf;
 264
 265 static void
 266 init_null_stringbuf (void)
 267 {
 268   null_stringbuf = make_stringbuf (0);
 269   SET_STRINGBUF_SHARED (null_stringbuf);
 270 }
 271
 272 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
 273    characters.  CHARSP, if not NULL, will be set to location of the
 274    char array.  If READ_ONLY_P, the returned string is read-only;
 275    otherwise it is writable.  */
 276 SCM
 277 scm_i_make_string (size_t len, char **charsp, int read_only_p)
 278 {
 279   SCM buf;
 280   SCM res;
 281
 282   if (len == 0)
 283     {
 284       static scm_i_pthread_once_t once = SCM_I_PTHREAD_ONCE_INIT;
 285       scm_i_pthread_once (&once, init_null_stringbuf);
 286       buf = null_stringbuf;
 287     }
 288   else
 289     buf = make_stringbuf (len);
 290
 291   if (charsp)
 292     *charsp = (char *) STRINGBUF_CHARS (buf);
 293   res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
 294                          SCM_UNPACK (buf),
 295                          (scm_t_bits) 0, (scm_t_bits) len);
 296   return res;
 297 }
 298
 299 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
 300    characters.  CHARSP, if not NULL, will be set to location of the
 301    character array.  If READ_ONLY_P, the returned string is read-only;
 302    otherwise it is writable.  */
 303 SCM
 304 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
 305 {
 306   SCM buf = make_wide_stringbuf (len);
 307   SCM res;
 308   if (charsp)
 309     *charsp = STRINGBUF_WIDE_CHARS (buf);
 310   res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
 311                          SCM_UNPACK (buf),
 312                          (scm_t_bits) 0, (scm_t_bits) len);
 313   return res;
 314 }
 315
 316 static void
 317 validate_substring_args (SCM str, size_t start, size_t end)
 318 {
 319   if (!IS_STRING (str))
 320     scm_wrong_type_arg_msg (NULL, 0, str, "string");
 321   if (start > STRING_LENGTH (str))
 322     scm_out_of_range (NULL, scm_from_size_t (start));
 323   if (end > STRING_LENGTH (str) || end < start)
 324     scm_out_of_range (NULL, scm_from_size_t (end));
 325 }
 326
 327 static inline void
 328 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
 329 {
 330   *start = STRING_START (*str);
 331   if (IS_SH_STRING (*str))
 332     {
 333       *str = SH_STRING_STRING (*str);
 334       *start += STRING_START (*str);
 335     }
 336   *buf = STRING_STRINGBUF (*str);
 337 }
 338
 339 SCM
 340 scm_i_substring (SCM str, size_t start, size_t end)
 341 {
 342   if (start == end)
 343     return scm_i_make_string (0, NULL, 0);
 344   else
 345     {
 346       SCM buf;
 347       size_t str_start;
 348       get_str_buf_start (&str, &buf, &str_start);
 349       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 350       SET_STRINGBUF_SHARED (buf);
 351       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 352       return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
 353                               (scm_t_bits)str_start + start,
 354                               (scm_t_bits) end - start);
 355     }
 356 }
 357
 358 SCM
 359 scm_i_substring_read_only (SCM str, size_t start, size_t end)
 360 {
 361   if (start == end)
 362     return scm_i_make_string (0, NULL, 1);
 363   else
 364     {
 365       SCM buf;
 366       size_t str_start;
 367       get_str_buf_start (&str, &buf, &str_start);
 368       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 369       SET_STRINGBUF_SHARED (buf);
 370       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 371       return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
 372                               (scm_t_bits)str_start + start,
 373                               (scm_t_bits) end - start);
 374     }
 375 }
 376
 377 SCM
 378 scm_i_substring_copy (SCM str, size_t start, size_t end)
 379 {
 380   if (start == end)
 381     return scm_i_make_string (0, NULL, 0);
 382   else
 383     {
 384       size_t len = end - start;
 385       SCM buf, my_buf, substr;
 386       size_t str_start;
 387       int wide = 0;
 388       get_str_buf_start (&str, &buf, &str_start);
 389       if (scm_i_is_narrow_string (str))
 390         {
 391           my_buf = make_stringbuf (len);
 392           memcpy (STRINGBUF_CHARS (my_buf),
 393                   STRINGBUF_CHARS (buf) + str_start + start, len);
 394         }
 395       else
 396         {
 397           my_buf = make_wide_stringbuf (len);
 398           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
 399                    (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
 400                                      + start), len);
 401           wide = 1;
 402         }
 403       scm_remember_upto_here_1 (buf);
 404       substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
 405                                 (scm_t_bits) 0, (scm_t_bits) len);
 406       if (wide)
 407         scm_i_try_narrow_string (substr);
 408       return substr;
 409     }
 410 }
 411
 412 SCM
 413 scm_i_substring_shared (SCM str, size_t start, size_t end)
 414 {
 415   if (start == 0 && end == STRING_LENGTH (str))
 416     return str;
 417   else if (start == end)
 418     return scm_i_make_string (0, NULL, 0);
 419   else
 420     {
 421       size_t len = end - start;
 422       if (IS_SH_STRING (str))
 423         {
 424           start += STRING_START (str);
 425           str = SH_STRING_STRING (str);
 426         }
 427       return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
 428                               (scm_t_bits)start, (scm_t_bits) len);
 429     }
 430 }
 431
 432 SCM
 433 scm_c_substring (SCM str, size_t start, size_t end)
 434 {
 435   validate_substring_args (str, start, end);
 436   return scm_i_substring (str, start, end);
 437 }
 438
 439 SCM
 440 scm_c_substring_read_only (SCM str, size_t start, size_t end)
 441 {
 442   validate_substring_args (str, start, end);
 443   return scm_i_substring_read_only (str, start, end);
 444 }
 445
 446 SCM
 447 scm_c_substring_copy (SCM str, size_t start, size_t end)
 448 {
 449   validate_substring_args (str, start, end);
 450   return scm_i_substring_copy (str, start, end);
 451 }
 452
 453 SCM
 454 scm_c_substring_shared (SCM str, size_t start, size_t end)
 455 {
 456   validate_substring_args (str, start, end);
 457   return scm_i_substring_shared (str, start, end);
 458 }
 459
 460 \f
 461 /* Internal accessors
 462  */
 463
 464 /* Returns the number of characters in STR.  This may be different
 465    than the memory size of the string storage.  */
 466 size_t
 467 scm_i_string_length (SCM str)
 468 {
 469   return STRING_LENGTH (str);
 470 }
 471
 472 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
 473    encoding.  False if it is 'wide', having a 32-bit UCS-4
 474    encoding.  */
 475 int
 476 scm_i_is_narrow_string (SCM str)
 477 {
 478   if (IS_SH_STRING (str))
 479     str = SH_STRING_STRING (str);
 480
 481   return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
 482 }
 483
 484 /* Try to coerce a string to be narrow.  It if is narrow already, do
 485    nothing.  If it is wide, shrink it to narrow if none of its
 486    characters are above 0xFF.  Return true if the string is narrow or
 487    was made to be narrow.  */
 488 int
 489 scm_i_try_narrow_string (SCM str)
 490 {
 491   if (IS_SH_STRING (str))
 492     str = SH_STRING_STRING (str);
 493
 494   SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
 495
 496   return scm_i_is_narrow_string (str);
 497 }
 498
 499 /* Return a pointer to the raw data of the string, which can be either Latin-1
 500    or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'.  */
 501 const void *
 502 scm_i_string_data (SCM str)
 503 {
 504   SCM buf;
 505   size_t start;
 506   const char *data;
 507
 508   get_str_buf_start (&str, &buf, &start);
 509
 510   data = STRINGBUF_CONTENTS (buf);
 511   data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
 512
 513   return data;
 514 }
 515
 516 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
 517    STR.  */
 518 const char *
 519 scm_i_string_chars (SCM str)
 520 {
 521   SCM buf;
 522   size_t start;
 523   get_str_buf_start (&str, &buf, &start);
 524   if (scm_i_is_narrow_string (str))
 525     return (const char *) STRINGBUF_CHARS (buf) + start;
 526   else
 527     scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
 528                     scm_list_1 (str));
 529   return NULL;
 530 }
 531
 532 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
 533    STR.  */
 534 const scm_t_wchar *
 535 scm_i_string_wide_chars (SCM str)
 536 {
 537   SCM buf;
 538   size_t start;
 539
 540   get_str_buf_start (&str, &buf, &start);
 541   if (!scm_i_is_narrow_string (str))
 542     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
 543   else
 544     scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
 545                     scm_list_1 (str));
 546 }
 547
 548 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
 549    a new string buffer, so that it can be modified without modifying
 550    other strings.  Also, lock the string mutex.  Later, one must call
 551    scm_i_string_stop_writing to unlock the mutex.  */
 552 SCM
 553 scm_i_string_start_writing (SCM orig_str)
 554 {
 555   SCM buf, str = orig_str;
 556   size_t start;
 557
 558   get_str_buf_start (&str, &buf, &start);
 559   if (IS_RO_STRING (str))
 560     scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
 561
 562   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 563   if (STRINGBUF_SHARED (buf))
 564     {
 565       /* Clone the stringbuf.  */
 566       size_t len = STRING_LENGTH (str);
 567       SCM new_buf;
 568
 569       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 570
 571       if (scm_i_is_narrow_string (str))
 572         {
 573           new_buf = make_stringbuf (len);
 574           memcpy (STRINGBUF_CHARS (new_buf),
 575                   STRINGBUF_CHARS (buf) + STRING_START (str), len);
 576
 577         }
 578       else
 579         {
 580           new_buf = make_wide_stringbuf (len);
 581           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 582                    (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
 583                                      + STRING_START (str)), len);
 584         }
 585
 586       SET_STRING_STRINGBUF (str, new_buf);
 587       start -= STRING_START (str);
 588
 589       /* FIXME: The following operations are not atomic, so other threads
 590          looking at STR may see an inconsistent state.  Nevertheless it can't
 591          hurt much since (i) accessing STR while it is being mutated can't
 592          yield a crash, and (ii) concurrent accesses to STR should be
 593          protected by a mutex at the application level.  The latter may not
 594          apply when STR != ORIG_STR, though.  */
 595       SET_STRING_START (str, 0);
 596       SET_STRING_STRINGBUF (str, new_buf);
 597
 598       buf = new_buf;
 599
 600       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 601     }
 602   return orig_str;
 603 }
 604
 605 /* Return a pointer to the 8-bit Latin-1 chars of a string.  */
 606 char *
 607 scm_i_string_writable_chars (SCM str)
 608 {
 609   SCM buf;
 610   size_t start;
 611
 612   get_str_buf_start (&str, &buf, &start);
 613   if (scm_i_is_narrow_string (str))
 614     return (char *) STRINGBUF_CHARS (buf) + start;
 615   else
 616     scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
 617                     scm_list_1 (str));
 618   return NULL;
 619 }
 620
 621 /* Return a pointer to the UCS-4 codepoints of a string.  */
 622 static scm_t_wchar *
 623 scm_i_string_writable_wide_chars (SCM str)
 624 {
 625   SCM buf;
 626   size_t start;
 627
 628   get_str_buf_start (&str, &buf, &start);
 629   if (!scm_i_is_narrow_string (str))
 630     return STRINGBUF_WIDE_CHARS (buf) + start;
 631   else
 632     scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
 633                     scm_list_1 (str));
 634 }
 635
 636 /* Unlock the string mutex that was locked when
 637    scm_i_string_start_writing was called.  */
 638 void
 639 scm_i_string_stop_writing (void)
 640 {
 641   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 642 }
 643
 644 /* Return the Xth character of STR as a UCS-4 codepoint.  */
 645 scm_t_wchar
 646 scm_i_string_ref (SCM str, size_t x)
 647 {
 648   if (scm_i_is_narrow_string (str))
 649     return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
 650   else
 651     return scm_i_string_wide_chars (str)[x];
 652 }
 653
 654 /* Returns index+1 of the first char in STR that matches C, or
 655    0 if the char is not found.  */
 656 int
 657 scm_i_string_contains_char (SCM str, char ch)
 658 {
 659   size_t i;
 660   size_t len = scm_i_string_length (str);
 661
 662   i = 0;
 663   if (scm_i_is_narrow_string (str))
 664     {
 665       while (i < len)
 666         {
 667           if (scm_i_string_chars (str)[i] == ch)
 668             return i+1;
 669           i++;
 670         }
 671     }
 672   else
 673     {
 674       while (i < len)
 675         {
 676           if (scm_i_string_wide_chars (str)[i]
 677               == (unsigned char) ch)
 678             return i+1;
 679           i++;
 680         }
 681     }
 682   return 0;
 683 }
 684
 685 int
 686 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
 687 {
 688   if (scm_i_is_narrow_string (sstr))
 689     {
 690       const char *a = scm_i_string_chars (sstr) + start_x;
 691       const char *b = cstr;
 692       return strncmp (a, b, strlen(b));
 693     }
 694   else
 695     {
 696       size_t i;
 697       const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
 698       const char *b = cstr;
 699       for (i = 0; i < strlen (b); i++)
 700         {
 701           if (a[i] != (unsigned char) b[i])
 702             return 1;
 703         }
 704     }
 705   return 0;
 706 }
 707
 708 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
 709 void
 710 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
 711 {
 712   if (IS_SH_STRING (str))
 713     {
 714       p += STRING_START (str);
 715       str = SH_STRING_STRING (str);
 716     }
 717
 718   if (chr > 0xFF && scm_i_is_narrow_string (str))
 719     SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
 720
 721   if (scm_i_is_narrow_string (str))
 722     {
 723       char *dst = scm_i_string_writable_chars (str);
 724       dst[p] = chr;
 725     }
 726   else
 727     {
 728       scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
 729       dst[p] = chr;
 730     }
 731 }
 732
 733 \f
 734 /* Symbols.
 735
 736    Basic symbol creation and accessing is done here, the rest is in
 737    symbols.[hc].  This has been done to keep stringbufs and the
 738    internals of strings and string-like objects confined to this file.
 739 */
 740
 741 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
 742
 743 SCM
 744 scm_i_make_symbol (SCM name, scm_t_bits flags,
 745                    unsigned long hash, SCM props)
 746 {
 747   SCM buf;
 748   size_t start = STRING_START (name);
 749   size_t length = STRING_LENGTH (name);
 750
 751   if (IS_SH_STRING (name))
 752     {
 753       name = SH_STRING_STRING (name);
 754       start += STRING_START (name);
 755     }
 756   buf = STRING_STRINGBUF (name);
 757
 758   if (start == 0 && length == STRINGBUF_LENGTH (buf))
 759     {
 760       /* reuse buf. */
 761       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 762       SET_STRINGBUF_SHARED (buf);
 763       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 764     }
 765   else
 766     {
 767       /* make new buf. */
 768       if (scm_i_is_narrow_string (name))
 769         {
 770           SCM new_buf = make_stringbuf (length);
 771           memcpy (STRINGBUF_CHARS (new_buf),
 772                   STRINGBUF_CHARS (buf) + start, length);
 773           buf = new_buf;
 774         }
 775       else
 776         {
 777           SCM new_buf = make_wide_stringbuf (length);
 778           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 779                    (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
 780                    length);
 781           buf = new_buf;
 782         }
 783     }
 784   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 785                           (scm_t_bits) hash, SCM_UNPACK (props));
 786 }
 787
 788 SCM
 789 scm_i_c_make_symbol (const char *name, size_t len,
 790                      scm_t_bits flags, unsigned long hash, SCM props)
 791 {
 792   SCM buf = make_stringbuf (len);
 793   memcpy (STRINGBUF_CHARS (buf), name, len);
 794
 795   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 796                           (scm_t_bits) hash, SCM_UNPACK (props));
 797 }
 798
 799 /* Returns the number of characters in SYM.  This may be different
 800    from the memory size of SYM.  */
 801 size_t
 802 scm_i_symbol_length (SCM sym)
 803 {
 804   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 805 }
 806
 807 size_t
 808 scm_c_symbol_length (SCM sym)
 809 #define FUNC_NAME "scm_c_symbol_length"
 810 {
 811   SCM_VALIDATE_SYMBOL (1, sym);
 812
 813   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 814 }
 815 #undef FUNC_NAME
 816
 817 /* True if the name of SYM is stored as a Latin-1 encoded string.
 818    False if it is stored as a 32-bit UCS-4-encoded string.  */
 819 int
 820 scm_i_is_narrow_symbol (SCM sym)
 821 {
 822   SCM buf;
 823
 824   buf = SYMBOL_STRINGBUF (sym);
 825   return !STRINGBUF_WIDE (buf);
 826 }
 827
 828 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
 829    contains the name of SYM.  */
 830 const char *
 831 scm_i_symbol_chars (SCM sym)
 832 {
 833   SCM buf;
 834
 835   buf = SYMBOL_STRINGBUF (sym);
 836   if (!STRINGBUF_WIDE (buf))
 837     return (const char *) STRINGBUF_CHARS (buf);
 838   else
 839     scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
 840                     scm_list_1 (sym));
 841 }
 842
 843 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
 844    symbol's name.  */
 845 const scm_t_wchar *
 846 scm_i_symbol_wide_chars (SCM sym)
 847 {
 848   SCM buf;
 849
 850   buf = SYMBOL_STRINGBUF (sym);
 851   if (STRINGBUF_WIDE (buf))
 852     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
 853   else
 854     scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
 855                     scm_list_1 (sym));
 856 }
 857
 858 SCM
 859 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
 860 {
 861   SCM buf = SYMBOL_STRINGBUF (sym);
 862   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 863   SET_STRINGBUF_SHARED (buf);
 864   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 865   return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
 866                           (scm_t_bits)start, (scm_t_bits) end - start);
 867 }
 868
 869 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint.  */
 870 scm_t_wchar
 871 scm_i_symbol_ref (SCM sym, size_t x)
 872 {
 873   if (scm_i_is_narrow_symbol (sym))
 874     return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
 875   else
 876     return scm_i_symbol_wide_chars (sym)[x];
 877 }
 878
 879 /* Debugging
 880  */
 881
 882 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
 883             "Returns an association list containing debugging information\n"
 884             "for @var{str}. The association list has the following entries."
 885             "@table @code\n"
 886             "@item string\n"
 887             "The string itself.\n"
 888             "@item start\n"
 889             "The start index of the string into its stringbuf\n"
 890             "@item length\n"
 891             "The length of the string\n"
 892             "@item shared\n"
 893             "If this string is a substring, it returns its parent string.\n"
 894             "Otherwise, it returns @code{#f}\n"
 895             "@item read-only\n"
 896             "@code{#t} if the string is read-only\n"
 897             "@item stringbuf-chars\n"
 898             "A new string containing this string's stringbuf's characters\n"
 899             "@item stringbuf-length\n"
 900             "The number of characters in this stringbuf\n"
 901             "@item stringbuf-shared\n"
 902             "@code{#t} if this stringbuf is shared\n"
 903             "@item stringbuf-wide\n"
 904             "@code{#t} if this stringbuf's characters are stored in a\n"
 905             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
 906             "buffer\n"
 907             "@end table")
 908 #define FUNC_NAME s_scm_sys_string_dump
 909 {
 910   SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
 911   SCM buf;
 912   SCM_VALIDATE_STRING (1, str);
 913
 914   /* String info */
 915   e1 = scm_cons (scm_from_latin1_symbol ("string"),
 916                  str);
 917   e2 = scm_cons (scm_from_latin1_symbol ("start"),
 918                  scm_from_size_t (STRING_START (str)));
 919   e3 = scm_cons (scm_from_latin1_symbol ("length"),
 920                  scm_from_size_t (STRING_LENGTH (str)));
 921
 922   if (IS_SH_STRING (str))
 923     {
 924       e4 = scm_cons (scm_from_latin1_symbol ("shared"),
 925                      SH_STRING_STRING (str));
 926       buf = STRING_STRINGBUF (SH_STRING_STRING (str));
 927     }
 928   else
 929     {
 930       e4 = scm_cons (scm_from_latin1_symbol ("shared"),
 931                      SCM_BOOL_F);
 932       buf = STRING_STRINGBUF (str);
 933     }
 934
 935   if (IS_RO_STRING (str))
 936     e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
 937                    SCM_BOOL_T);
 938   else
 939     e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
 940                    SCM_BOOL_F);
 941
 942   /* Stringbuf info */
 943   if (!STRINGBUF_WIDE (buf))
 944     {
 945       size_t len = STRINGBUF_LENGTH (buf);
 946       char *cbuf;
 947       SCM sbc = scm_i_make_string (len, &cbuf, 0);
 948       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
 949       e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 950                      sbc);
 951     }
 952   else
 953     {
 954       size_t len = STRINGBUF_LENGTH (buf);
 955       scm_t_wchar *cbuf;
 956       SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
 957       u32_cpy ((scm_t_uint32 *) cbuf,
 958                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
 959       e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 960                      sbc);
 961     }
 962   e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
 963                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
 964   if (STRINGBUF_SHARED (buf))
 965     e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 966                    SCM_BOOL_T);
 967   else
 968     e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 969                    SCM_BOOL_F);
 970   if (STRINGBUF_WIDE (buf))
 971     e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 972                    SCM_BOOL_T);
 973   else
 974     e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 975                    SCM_BOOL_F);
 976
 977   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
 978 }
 979 #undef FUNC_NAME
 980
 981 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
 982             "Returns an association list containing debugging information\n"
 983             "for @var{sym}. The association list has the following entries."
 984             "@table @code\n"
 985             "@item symbol\n"
 986             "The symbol itself\n"
 987             "@item hash\n"
 988             "Its hash value\n"
 989             "@item interned\n"
 990             "@code{#t} if it is an interned symbol\n"
 991             "@item stringbuf-chars\n"
 992             "A new string containing this symbols's stringbuf's characters\n"
 993             "@item stringbuf-length\n"
 994             "The number of characters in this stringbuf\n"
 995             "@item stringbuf-shared\n"
 996             "@code{#t} if this stringbuf is shared\n"
 997             "@item stringbuf-wide\n"
 998             "@code{#t} if this stringbuf's characters are stored in a\n"
 999             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
1000             "buffer\n"
1001             "@end table")
1002 #define FUNC_NAME s_scm_sys_symbol_dump
1003 {
1004   SCM e1, e2, e3, e4, e5, e6, e7;
1005   SCM buf;
1006   SCM_VALIDATE_SYMBOL (1, sym);
1007   e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
1008                  sym);
1009   e2 = scm_cons (scm_from_latin1_symbol ("hash"),
1010                  scm_from_ulong (scm_i_symbol_hash (sym)));
1011   e3 = scm_cons (scm_from_latin1_symbol ("interned"),
1012                  scm_symbol_interned_p (sym));
1013   buf = SYMBOL_STRINGBUF (sym);
1014
1015   /* Stringbuf info */
1016   if (!STRINGBUF_WIDE (buf))
1017     {
1018       size_t len = STRINGBUF_LENGTH (buf);
1019       char *cbuf;
1020       SCM sbc = scm_i_make_string (len, &cbuf, 0);
1021       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
1022       e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1023                      sbc);
1024     }
1025   else
1026     {
1027       size_t len = STRINGBUF_LENGTH (buf);
1028       scm_t_wchar *cbuf;
1029       SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
1030       u32_cpy ((scm_t_uint32 *) cbuf,
1031                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
1032       e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1033                      sbc);
1034     }
1035   e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
1036                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
1037   if (STRINGBUF_SHARED (buf))
1038     e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1039                    SCM_BOOL_T);
1040   else
1041     e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1042                    SCM_BOOL_F);
1043   if (STRINGBUF_WIDE (buf))
1044     e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1045                     SCM_BOOL_T);
1046   else
1047     e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1048                     SCM_BOOL_F);
1049   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
1050
1051 }
1052 #undef FUNC_NAME
1053
1054 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1055
1056 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
1057 #define FUNC_NAME s_scm_sys_stringbuf_hist
1058 {
1059   int i;
1060   for (i = 0; i < 1000; i++)
1061     if (lenhist[i])
1062       fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1063   fprintf (stderr, ">999: %u\n", lenhist[1000]);
1064   return SCM_UNSPECIFIED;
1065 }
1066 #undef FUNC_NAME
1067
1068 #endif
1069
1070 \f
1071
1072 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1073             (SCM obj),
1074             "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1075 #define FUNC_NAME s_scm_string_p
1076 {
1077   return scm_from_bool (IS_STRING (obj));
1078 }
1079 #undef FUNC_NAME
1080
1081
1082 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1083
1084 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1085             (SCM chrs),
1086             "@deffnx {Scheme Procedure} list->string chrs\n"
1087             "Return a newly allocated string composed of the arguments,\n"
1088             "@var{chrs}.")
1089 #define FUNC_NAME s_scm_string
1090 {
1091   SCM result = SCM_BOOL_F;
1092   SCM rest;
1093   size_t len;
1094   size_t p = 0;
1095   long i;
1096   int wide = 0;
1097
1098   /* Verify that this is a list of chars.  */
1099   i = scm_ilength (chrs);
1100   SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1101
1102   len = (size_t) i;
1103   rest = chrs;
1104
1105   while (len > 0 && scm_is_pair (rest))
1106     {
1107       SCM elt = SCM_CAR (rest);
1108       SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1109       if (SCM_CHAR (elt) > 0xFF)
1110         wide = 1;
1111       rest = SCM_CDR (rest);
1112       len--;
1113       scm_remember_upto_here_1 (elt);
1114     }
1115
1116   /* Construct a string containing this list of chars.  */
1117   len = (size_t) i;
1118   rest = chrs;
1119
1120   if (wide == 0)
1121     {
1122       char *buf;
1123
1124       result = scm_i_make_string (len, NULL, 0);
1125       result = scm_i_string_start_writing (result);
1126       buf = scm_i_string_writable_chars (result);
1127       while (len > 0 && scm_is_pair (rest))
1128         {
1129           SCM elt = SCM_CAR (rest);
1130           buf[p] = (unsigned char) SCM_CHAR (elt);
1131           p++;
1132           rest = SCM_CDR (rest);
1133           len--;
1134           scm_remember_upto_here_1 (elt);
1135         }
1136     }
1137   else
1138     {
1139       scm_t_wchar *buf;
1140
1141       result = scm_i_make_wide_string (len, NULL, 0);
1142       result = scm_i_string_start_writing (result);
1143       buf = scm_i_string_writable_wide_chars (result);
1144       while (len > 0 && scm_is_pair (rest))
1145         {
1146           SCM elt = SCM_CAR (rest);
1147           buf[p] = SCM_CHAR (elt);
1148           p++;
1149           rest = SCM_CDR (rest);
1150           len--;
1151           scm_remember_upto_here_1 (elt);
1152         }
1153     }
1154   scm_i_string_stop_writing ();
1155
1156   if (len > 0)
1157     scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1158   if (!scm_is_null (rest))
1159     scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1160
1161   return result;
1162 }
1163 #undef FUNC_NAME
1164
1165 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1166             (SCM k, SCM chr),
1167             "Return a newly allocated string of\n"
1168             "length @var{k}.  If @var{chr} is given, then all elements of\n"
1169             "the string are initialized to @var{chr}, otherwise the contents\n"
1170             "of the string are all set to @code{#\nul}.")
1171 #define FUNC_NAME s_scm_make_string
1172 {
1173   return scm_c_make_string (scm_to_size_t (k), chr);
1174 }
1175 #undef FUNC_NAME
1176
1177 SCM
1178 scm_c_make_string (size_t len, SCM chr)
1179 #define FUNC_NAME NULL
1180 {
1181   size_t p;
1182   char *contents = NULL;
1183   SCM res = scm_i_make_string (len, &contents, 0);
1184
1185   /* If no char is given, initialize string contents to NULL.  */
1186   if (SCM_UNBNDP (chr))
1187     memset (contents, 0, len);
1188   else
1189     {
1190       SCM_VALIDATE_CHAR (0, chr);
1191       res = scm_i_string_start_writing (res);
1192       for (p = 0; p < len; p++)
1193         scm_i_string_set_x (res, p, SCM_CHAR (chr));
1194       scm_i_string_stop_writing ();
1195     }
1196
1197   return res;
1198 }
1199 #undef FUNC_NAME
1200
1201 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1202             (SCM string),
1203             "Return the number of characters in @var{string}.")
1204 #define FUNC_NAME s_scm_string_length
1205 {
1206   SCM_VALIDATE_STRING (1, string);
1207   return scm_from_size_t (STRING_LENGTH (string));
1208 }
1209 #undef FUNC_NAME
1210
1211 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1212             (SCM string),
1213             "Return the bytes used to represent a character in @var{string}."
1214             "This will return 1 or 4.")
1215 #define FUNC_NAME s_scm_string_bytes_per_char
1216 {
1217   SCM_VALIDATE_STRING (1, string);
1218   if (!scm_i_is_narrow_string (string))
1219     return scm_from_int (4);
1220
1221   return scm_from_int (1);
1222 }
1223 #undef FUNC_NAME
1224
1225 size_t
1226 scm_c_string_length (SCM string)
1227 {
1228   if (!IS_STRING (string))
1229     scm_wrong_type_arg_msg (NULL, 0, string, "string");
1230   return STRING_LENGTH (string);
1231 }
1232
1233 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1234             (SCM str, SCM k),
1235             "Return character @var{k} of @var{str} using zero-origin\n"
1236             "indexing. @var{k} must be a valid index of @var{str}.")
1237 #define FUNC_NAME s_scm_string_ref
1238 {
1239   size_t len;
1240   unsigned long idx;
1241
1242   SCM_VALIDATE_STRING (1, str);
1243
1244   len = scm_i_string_length (str);
1245   if (SCM_LIKELY (len > 0))
1246     idx = scm_to_unsigned_integer (k, 0, len - 1);
1247   else
1248     scm_out_of_range (NULL, k);
1249
1250   if (scm_i_is_narrow_string (str))
1251     return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1252   else
1253     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1254 }
1255 #undef FUNC_NAME
1256
1257 SCM
1258 scm_c_string_ref (SCM str, size_t p)
1259 {
1260   if (p >= scm_i_string_length (str))
1261     scm_out_of_range (NULL, scm_from_size_t (p));
1262   if (scm_i_is_narrow_string (str))
1263     return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1264   else
1265     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1266
1267 }
1268
1269 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1270             (SCM str, SCM k, SCM chr),
1271             "Store @var{chr} in element @var{k} of @var{str} and return\n"
1272             "an unspecified value. @var{k} must be a valid index of\n"
1273             "@var{str}.")
1274 #define FUNC_NAME s_scm_string_set_x
1275 {
1276   size_t len;
1277   unsigned long idx;
1278
1279   SCM_VALIDATE_STRING (1, str);
1280
1281   len = scm_i_string_length (str);
1282   if (SCM_LIKELY (len > 0))
1283     idx = scm_to_unsigned_integer (k, 0, len - 1);
1284   else
1285     scm_out_of_range (NULL, k);
1286
1287   SCM_VALIDATE_CHAR (3, chr);
1288   str = scm_i_string_start_writing (str);
1289   scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1290   scm_i_string_stop_writing ();
1291
1292   return SCM_UNSPECIFIED;
1293 }
1294 #undef FUNC_NAME
1295
1296 void
1297 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1298 {
1299   if (p >= scm_i_string_length (str))
1300     scm_out_of_range (NULL, scm_from_size_t (p));
1301   str = scm_i_string_start_writing (str);
1302   scm_i_string_set_x (str, p, SCM_CHAR (chr));
1303   scm_i_string_stop_writing ();
1304 }
1305
1306 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1307             (SCM str, SCM start, SCM end),
1308             "Return a newly allocated string formed from the characters\n"
1309             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1310             "ending with index @var{end} (exclusive).\n"
1311             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1312             "exact integers satisfying:\n\n"
1313             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1314 #define FUNC_NAME s_scm_substring
1315 {
1316   size_t len, from, to;
1317
1318   SCM_VALIDATE_STRING (1, str);
1319   len = scm_i_string_length (str);
1320   from = scm_to_unsigned_integer (start, 0, len);
1321   if (SCM_UNBNDP (end))
1322     to = len;
1323   else
1324     to = scm_to_unsigned_integer (end, from, len);
1325   return scm_i_substring (str, from, to);
1326 }
1327 #undef FUNC_NAME
1328
1329 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1330             (SCM str, SCM start, SCM end),
1331             "Return a newly allocated string formed from the characters\n"
1332             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1333             "ending with index @var{end} (exclusive).\n"
1334             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1335             "exact integers satisfying:\n"
1336             "\n"
1337             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1338             "\n"
1339             "The returned string is read-only.\n")
1340 #define FUNC_NAME s_scm_substring_read_only
1341 {
1342   size_t len, from, to;
1343
1344   SCM_VALIDATE_STRING (1, str);
1345   len = scm_i_string_length (str);
1346   from = scm_to_unsigned_integer (start, 0, len);
1347   if (SCM_UNBNDP (end))
1348     to = len;
1349   else
1350     to = scm_to_unsigned_integer (end, from, len);
1351   return scm_i_substring_read_only (str, from, to);
1352 }
1353 #undef FUNC_NAME
1354
1355 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1356             (SCM str, SCM start, SCM end),
1357             "Return a newly allocated string formed from the characters\n"
1358             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1359             "ending with index @var{end} (exclusive).\n"
1360             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1361             "exact integers satisfying:\n\n"
1362             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1363 #define FUNC_NAME s_scm_substring_copy
1364 {
1365   /* For the Scheme version, START is mandatory, but for the C
1366      version, it is optional.  See scm_string_copy in srfi-13.c for a
1367      rationale.
1368   */
1369
1370   size_t from, to;
1371
1372   SCM_VALIDATE_STRING (1, str);
1373   scm_i_get_substring_spec (scm_i_string_length (str),
1374                             start, &from, end, &to);
1375   return scm_i_substring_copy (str, from, to);
1376 }
1377 #undef FUNC_NAME
1378
1379 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1380             (SCM str, SCM start, SCM end),
1381             "Return string that indirectly refers to the characters\n"
1382             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1383             "ending with index @var{end} (exclusive).\n"
1384             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1385             "exact integers satisfying:\n\n"
1386             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1387 #define FUNC_NAME s_scm_substring_shared
1388 {
1389   size_t len, from, to;
1390
1391   SCM_VALIDATE_STRING (1, str);
1392   len = scm_i_string_length (str);
1393   from = scm_to_unsigned_integer (start, 0, len);
1394   if (SCM_UNBNDP (end))
1395     to = len;
1396   else
1397     to = scm_to_unsigned_integer (end, from, len);
1398   return scm_i_substring_shared (str, from, to);
1399 }
1400 #undef FUNC_NAME
1401
1402 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1403             (SCM args),
1404             "Return a newly allocated string whose characters form the\n"
1405             "concatenation of the given strings, @var{args}.")
1406 #define FUNC_NAME s_scm_string_append
1407 {
1408   SCM res;
1409   size_t total = 0;
1410   size_t len;
1411   int wide = 0;
1412   SCM l, s;
1413   size_t i;
1414   union
1415   {
1416     char *narrow;
1417     scm_t_wchar *wide;
1418   } data;
1419
1420   SCM_VALIDATE_REST_ARGUMENT (args);
1421   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1422     {
1423       s = SCM_CAR (l);
1424       SCM_VALIDATE_STRING (SCM_ARGn, s);
1425       len = scm_i_string_length (s);
1426       if (((size_t) -1) - total < len)
1427         scm_num_overflow (s_scm_string_append);
1428       total += len;
1429       if (!scm_i_is_narrow_string (s))
1430         wide = 1;
1431     }
1432   data.narrow = NULL;
1433   if (!wide)
1434     res = scm_i_make_string (total, &data.narrow, 0);
1435   else
1436     res = scm_i_make_wide_string (total, &data.wide, 0);
1437
1438   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1439     {
1440       size_t len;
1441       s = SCM_CAR (l);
1442       SCM_VALIDATE_STRING (SCM_ARGn, s);
1443       len = scm_i_string_length (s);
1444       if (len > total)
1445         SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
1446       if (!wide)
1447         {
1448           memcpy (data.narrow, scm_i_string_chars (s), len);
1449           data.narrow += len;
1450         }
1451       else
1452         {
1453           if (scm_i_is_narrow_string (s))
1454             {
1455               const char *src = scm_i_string_chars (s);
1456               for (i = 0; i < len; i++)
1457                 data.wide[i] = (unsigned char) src[i];
1458             }
1459           else
1460             u32_cpy ((scm_t_uint32 *) data.wide,
1461                      (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1462           data.wide += len;
1463         }
1464       total -= len;
1465       scm_remember_upto_here_1 (s);
1466     }
1467   if (total != 0)
1468     SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
1469   return res;
1470 }
1471 #undef FUNC_NAME
1472
1473
1474 \f
1475 /* Charset conversion error handling.  */
1476
1477 SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1478 SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1479
1480 /* Raise an exception informing that character CHR could not be written
1481    to PORT in its current encoding.  */
1482 void
1483 scm_encoding_error (const char *subr, int err, const char *message,
1484                     SCM port, SCM chr)
1485 {
1486   scm_throw (scm_encoding_error_key,
1487              scm_list_n (scm_from_latin1_string (subr),
1488                          scm_from_latin1_string (message),
1489                          scm_from_int (err),
1490                          port, chr,
1491                          SCM_UNDEFINED));
1492 }
1493
1494 /* Raise an exception informing of an encoding error on PORT.  This
1495    means that a character could not be written in PORT's encoding.  */
1496 void
1497 scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1498 {
1499   scm_throw (scm_decoding_error_key,
1500              scm_list_n (scm_from_latin1_string (subr),
1501                          scm_from_latin1_string (message),
1502                          scm_from_int (err),
1503                          port,
1504                          SCM_UNDEFINED));
1505 }
1506
1507 \f
1508 /* String conversion to/from C.  */
1509
1510 SCM
1511 scm_from_stringn (const char *str, size_t len, const char *encoding,
1512                   scm_t_string_failed_conversion_handler handler)
1513 {
1514   size_t u32len, i;
1515   scm_t_wchar *u32;
1516   int wide = 0;
1517   SCM res;
1518
1519   /* The order of these checks is important. */
1520   if (!str && len != 0)
1521     scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1522   if (len == (size_t) -1)
1523     len = strlen (str);
1524
1525   if (encoding == NULL || len == 0)
1526     {
1527       /* If encoding is null (or the string is empty), use Latin-1.  */
1528       char *buf;
1529       res = scm_i_make_string (len, &buf, 0);
1530       memcpy (buf, str, len);
1531       return res;
1532     }
1533
1534   u32len = 0;
1535   u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1536                                                 (enum iconv_ilseq_handler)
1537                                                 handler,
1538                                                 str, len,
1539                                                 NULL,
1540                                                 NULL, &u32len);
1541
1542   if (SCM_UNLIKELY (u32 == NULL))
1543     {
1544       /* Raise an error and pass the raw C string as a bytevector to the `throw'
1545          handler.  */
1546       SCM bv;
1547       signed char *buf;
1548
1549       buf = scm_gc_malloc_pointerless (len, "bytevector");
1550       memcpy (buf, str, len);
1551       bv = scm_c_take_gc_bytevector (buf, len);
1552
1553       scm_decoding_error (__func__, errno,
1554                           "input locale conversion error", bv);
1555     }
1556
1557   i = 0;
1558   while (i < u32len)
1559     if (u32[i++] > 0xFF)
1560       {
1561         wide = 1;
1562         break;
1563       }
1564
1565   if (!wide)
1566     {
1567       char *dst;
1568       res = scm_i_make_string (u32len, &dst, 0);
1569       for (i = 0; i < u32len; i ++)
1570         dst[i] = (unsigned char) u32[i];
1571       dst[u32len] = '\0';
1572     }
1573   else
1574     {
1575       scm_t_wchar *wdst;
1576       res = scm_i_make_wide_string (u32len, &wdst, 0);
1577       u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1578       wdst[u32len] = 0;
1579     }
1580
1581   free (u32);
1582   return res;
1583 }
1584
1585 SCM
1586 scm_from_locale_string (const char *str)
1587 {
1588   return scm_from_locale_stringn (str, -1);
1589 }
1590
1591 SCM
1592 scm_from_locale_stringn (const char *str, size_t len)
1593 {
1594   return scm_from_stringn (str, len, locale_charset (),
1595                            scm_i_default_port_conversion_handler ());
1596 }
1597
1598 SCM
1599 scm_from_latin1_string (const char *str)
1600 {
1601   return scm_from_latin1_stringn (str, -1);
1602 }
1603
1604 SCM
1605 scm_from_latin1_stringn (const char *str, size_t len)
1606 {
1607   char *buf;
1608   SCM result;
1609
1610   if (len == (size_t) -1)
1611     len = strlen (str);
1612
1613   /* Make a narrow string and copy STR as is.  */
1614   result = scm_i_make_string (len, &buf, 0);
1615   memcpy (buf, str, len);
1616
1617   return result;
1618 }
1619
1620 SCM
1621 scm_from_utf8_string (const char *str)
1622 {
1623   return scm_from_utf8_stringn (str, -1);
1624 }
1625
1626 SCM
1627 scm_from_utf8_stringn (const char *str, size_t len)
1628 {
1629   return scm_from_stringn (str, len, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1630 }
1631
1632 SCM
1633 scm_from_utf32_string (const scm_t_wchar *str)
1634 {
1635   return scm_from_utf32_stringn (str, -1);
1636 }
1637
1638 SCM
1639 scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1640 {
1641   SCM result;
1642   scm_t_wchar *buf;
1643
1644   if (len == (size_t) -1)
1645     len = u32_strlen ((uint32_t *) str);
1646
1647   result = scm_i_make_wide_string (len, &buf, 0);
1648   memcpy (buf, str, len * sizeof (scm_t_wchar));
1649   scm_i_try_narrow_string (result);
1650
1651   return result;
1652 }
1653
1654 /* Create a new scheme string from the C string STR.  The memory of
1655    STR may be used directly as storage for the new string.  */
1656 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1657    would be to register a finalizer to eventually free(3) STR, which isn't
1658    worth it.  Should we just deprecate the `scm_take_' functions?  */
1659 SCM
1660 scm_take_locale_stringn (char *str, size_t len)
1661 {
1662   SCM res;
1663
1664   res = scm_from_locale_stringn (str, len);
1665   free (str);
1666
1667   return res;
1668 }
1669
1670 SCM
1671 scm_take_locale_string (char *str)
1672 {
1673   return scm_take_locale_stringn (str, -1);
1674 }
1675
1676 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1677    *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1678    Set *LENP to the size of the resulting string.
1679
1680    FIXME: This is a hack we should get rid of.  See
1681    <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1682    for details.  */
1683 static void
1684 unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
1685 {
1686   char *before, *after;
1687   size_t i, j;
1688
1689   before = buf;
1690   after = buf;
1691   i = 0;
1692   j = 0;
1693   while (i < *lenp)
1694     {
1695       if ((i <= *lenp - 6)
1696           && before[i] == '\\'
1697           && before[i + 1] == 'u'
1698           && before[i + 2] == '0' && before[i + 3] == '0')
1699         {
1700           /* Convert \u00NN to \xNN */
1701           after[j] = '\\';
1702           after[j + 1] = 'x';
1703           after[j + 2] = tolower ((int) before[i + 4]);
1704           after[j + 3] = tolower ((int) before[i + 5]);
1705           i += 6;
1706           j += 4;
1707         }
1708       else if ((i <= *lenp - 10)
1709                && before[i] == '\\'
1710                && before[i + 1] == 'U'
1711                && before[i + 2] == '0' && before[i + 3] == '0')
1712         {
1713           /* Convert \U00NNNNNN to \UNNNNNN */
1714           after[j] = '\\';
1715           after[j + 1] = 'U';
1716           after[j + 2] = tolower ((int) before[i + 4]);
1717           after[j + 3] = tolower ((int) before[i + 5]);
1718           after[j + 4] = tolower ((int) before[i + 6]);
1719           after[j + 5] = tolower ((int) before[i + 7]);
1720           after[j + 6] = tolower ((int) before[i + 8]);
1721           after[j + 7] = tolower ((int) before[i + 9]);
1722           i += 10;
1723           j += 8;
1724         }
1725       else
1726         {
1727           after[j] = before[i];
1728           i++;
1729           j++;
1730         }
1731     }
1732   *lenp = j;
1733 }
1734
1735 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1736    *LENP-byte locale-encoded string, to `\xXXXX;'.  Set *LEN to the size
1737    of the resulting string.  BUF must be large enough to handle the
1738    worst case when `\uXXXX' escapes (6 characters) are replaced by
1739    `\xXXXX;' (7 characters).  */
1740 static void
1741 unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
1742 {
1743   char *before, *after;
1744   size_t i, j;
1745   /* The worst case is if the input string contains all 4-digit hex escapes.
1746      "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1747   size_t max_out_len = (*lenp * 7) / 6 + 1;
1748   size_t nzeros, ndigits;
1749
1750   before = buf;
1751   after = alloca (max_out_len);
1752   i = 0;
1753   j = 0;
1754   while (i < *lenp)
1755     {
1756       if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1757           || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1758         {
1759           if (before[i + 1] == 'u')
1760             ndigits = 4;
1761           else if (before[i + 1] == 'U')
1762             ndigits = 8;
1763           else
1764             abort ();
1765
1766           /* Add the R6RS hex escape initial sequence.  */
1767           after[j] = '\\';
1768           after[j + 1] = 'x';
1769
1770           /* Move string positions to the start of the hex numbers.  */
1771           i += 2;
1772           j += 2;
1773
1774           /* Find the number of initial zeros in this hex number.  */
1775           nzeros = 0;
1776           while (before[i + nzeros] == '0' && nzeros < ndigits)
1777             nzeros++;
1778
1779           /* Copy the number, skipping initial zeros, and then move the string
1780              positions.  */
1781           if (nzeros == ndigits)
1782             {
1783               after[j] = '0';
1784               i += ndigits;
1785               j += 1;
1786             }
1787           else
1788             {
1789               int pos;
1790               for (pos = 0; pos < ndigits - nzeros; pos++)
1791                 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1792               i += ndigits;
1793               j += (ndigits - nzeros);
1794             }
1795
1796           /* Add terminating semicolon.  */
1797           after[j] = ';';
1798           j++;
1799         }
1800       else
1801         {
1802           after[j] = before[i];
1803           i++;
1804           j++;
1805         }
1806     }
1807   *lenp = j;
1808   memcpy (before, after, j);
1809 }
1810
1811 char *
1812 scm_to_locale_string (SCM str)
1813 {
1814   return scm_to_locale_stringn (str, NULL);
1815 }
1816
1817 char *
1818 scm_to_locale_stringn (SCM str, size_t *lenp)
1819 {
1820   return scm_to_stringn (str, lenp,
1821                          locale_charset (),
1822                          scm_i_default_port_conversion_handler ());
1823 }
1824
1825 char *
1826 scm_to_latin1_string (SCM str)
1827 {
1828   return scm_to_latin1_stringn (str, NULL);
1829 }
1830
1831 char *
1832 scm_to_latin1_stringn (SCM str, size_t *lenp)
1833 #define FUNC_NAME "scm_to_latin1_stringn"
1834 {
1835   char *result;
1836
1837   SCM_VALIDATE_STRING (1, str);
1838
1839   if (scm_i_is_narrow_string (str))
1840     {
1841       size_t len = scm_i_string_length (str);
1842
1843       if (lenp)
1844         *lenp = len;
1845
1846       result = scm_strndup (scm_i_string_data (str), len);
1847     }
1848   else
1849     result = scm_to_stringn (str, lenp, NULL,
1850                              SCM_FAILED_CONVERSION_ERROR);
1851
1852   return result;
1853 }
1854 #undef FUNC_NAME
1855
1856 char *
1857 scm_to_utf8_string (SCM str)
1858 {
1859   return scm_to_utf8_stringn (str, NULL);
1860 }
1861
1862 static size_t
1863 latin1_u8_strlen (const scm_t_uint8 *str, size_t len)
1864 {
1865   size_t ret, i;
1866   for (i = 0, ret = 0; i < len; i++)
1867     ret += (str[i] < 128) ? 1 : 2;
1868   return ret;
1869 }
1870
1871 static scm_t_uint8*
1872 latin1_to_u8 (const scm_t_uint8 *str, size_t latin_len,
1873               scm_t_uint8 *u8_result, size_t *u8_lenp)
1874 {
1875   size_t i, n;
1876   size_t u8_len = latin1_u8_strlen (str, latin_len);
1877
1878   if (!(u8_result && u8_lenp && *u8_lenp > u8_len))
1879     u8_result = scm_malloc (u8_len + 1);
1880   if (u8_lenp)
1881     *u8_lenp = u8_len;
1882
1883   for (i = 0, n = 0; i < latin_len; i++)
1884     n += u8_uctomb (u8_result + n, str[i], u8_len - n);
1885   if (n != u8_len)
1886     abort ();
1887   u8_result[n] = 0;
1888
1889   return u8_result;
1890 }
1891
1892 /* UTF-8 code table
1893
1894    (Note that this includes code points that are not allowed by Unicode,
1895     but since this function has no way to report an error, and its
1896     purpose is to determine the size of destination buffers for
1897     libunicode conversion functions, we err on the safe side and handle
1898     everything that libunicode might conceivably handle, now or in the
1899     future.)
1900
1901    Char. number range  |        UTF-8 octet sequence
1902       (hexadecimal)    |              (binary)
1903    --------------------+------------------------------------------------------
1904    0000 0000-0000 007F | 0xxxxxxx
1905    0000 0080-0000 07FF | 110xxxxx 10xxxxxx
1906    0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1907    0001 0000-001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1908    0020 0000-03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
1909    0400 0000-7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
1910 */
1911
1912 static size_t
1913 u32_u8_length_in_bytes (const scm_t_uint32 *str, size_t len)
1914 {
1915   size_t ret, i;
1916
1917   for (i = 0, ret = 0; i < len; i++)
1918     {
1919       scm_t_uint32 c = str[i];
1920
1921       if (c <= 0x7f)
1922         ret += 1;
1923       else if (c <= 0x7ff)
1924         ret += 2;
1925       else if (c <= 0xffff)
1926         ret += 3;
1927       else if (c <= 0x1fffff)
1928         ret += 4;
1929       else if (c <= 0x3ffffff)
1930         ret += 5;
1931       else
1932         ret += 6;
1933     }
1934
1935   return ret;
1936 }
1937
1938 char *
1939 scm_to_utf8_stringn (SCM str, size_t *lenp)
1940 #define FUNC_NAME "scm_to_utf8_stringn"
1941 {
1942   SCM_VALIDATE_STRING (1, str);
1943
1944   if (scm_i_is_narrow_string (str))
1945     return (char *) latin1_to_u8 ((scm_t_uint8 *) scm_i_string_chars (str),
1946                                   scm_i_string_length (str),
1947                                   NULL, lenp);
1948   else
1949     {
1950       scm_t_uint32 *chars = (scm_t_uint32 *) scm_i_string_wide_chars (str);
1951       scm_t_uint8 *buf, *ret;
1952       size_t num_chars = scm_i_string_length (str);
1953       size_t num_bytes_predicted, num_bytes_actual;
1954
1955       num_bytes_predicted = u32_u8_length_in_bytes (chars, num_chars);
1956
1957       if (lenp)
1958         {
1959           *lenp = num_bytes_predicted;
1960           buf = scm_malloc (num_bytes_predicted);
1961         }
1962       else
1963         {
1964           buf = scm_malloc (num_bytes_predicted + 1);
1965           buf[num_bytes_predicted] = 0;
1966         }
1967
1968       num_bytes_actual = num_bytes_predicted;
1969       ret = u32_to_u8 (chars, num_chars, buf, &num_bytes_actual);
1970
1971       if (SCM_LIKELY (ret == buf && num_bytes_actual == num_bytes_predicted))
1972         return (char *) ret;
1973
1974       /* An error: a bad codepoint.  */
1975       {
1976         int saved_errno = errno;
1977
1978         free (buf);
1979         if (!saved_errno)
1980           abort ();
1981
1982         scm_decoding_error ("scm_to_utf8_stringn", errno,
1983                             "invalid codepoint in string", str);
1984
1985         /* Not reached.  */
1986         return NULL;
1987       }
1988     }
1989 }
1990 #undef FUNC_NAME
1991
1992 scm_t_wchar *
1993 scm_to_utf32_string (SCM str)
1994 {
1995   return scm_to_utf32_stringn (str, NULL);
1996 }
1997
1998 scm_t_wchar *
1999 scm_to_utf32_stringn (SCM str, size_t *lenp)
2000 #define FUNC_NAME "scm_to_utf32_stringn"
2001 {
2002   scm_t_wchar *result;
2003
2004   SCM_VALIDATE_STRING (1, str);
2005
2006   if (scm_i_is_narrow_string (str))
2007     {
2008       scm_t_uint8 *codepoints;
2009       size_t i, len;
2010
2011       codepoints = (scm_t_uint8*) scm_i_string_chars (str);
2012       len = scm_i_string_length (str);
2013       if (lenp)
2014         *lenp = len;
2015
2016       result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2017       for (i = 0; i < len; i++)
2018         result[i] = codepoints[i];
2019       result[len] = 0;
2020     }
2021   else
2022     {
2023       size_t len;
2024
2025       len = scm_i_string_length (str);
2026       if (lenp)
2027         *lenp = len;
2028
2029       result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2030       memcpy (result, scm_i_string_wide_chars (str),
2031               len * sizeof (scm_t_wchar));
2032       result[len] = 0;
2033     }
2034
2035   return result;
2036 }
2037 #undef FUNC_NAME
2038
2039 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
2040    according to ENCODING.  If LENP is non-NULL, set it to the size in bytes of
2041    the returned buffer.  If the conversion to ENCODING fails, apply the strategy
2042    defined by HANDLER.  */
2043 char *
2044 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
2045                 scm_t_string_failed_conversion_handler handler)
2046 {
2047   char *buf;
2048   size_t ilen, len, i;
2049   int ret;
2050   const char *enc;
2051
2052   if (!scm_is_string (str))
2053     scm_wrong_type_arg_msg (NULL, 0, str, "string");
2054   ilen = scm_i_string_length (str);
2055
2056   if (ilen == 0)
2057     {
2058       buf = scm_malloc (1);
2059       buf[0] = '\0';
2060       if (lenp)
2061         *lenp = 0;
2062       return buf;
2063     }
2064
2065   if (lenp == NULL)
2066     for (i = 0; i < ilen; i++)
2067       if (scm_i_string_ref (str, i) == '\0')
2068         scm_misc_error (NULL,
2069                         "string contains #\\nul character: ~S",
2070                         scm_list_1 (str));
2071
2072   if (scm_i_is_narrow_string (str) && (encoding == NULL))
2073     {
2074       /* If using native Latin-1 encoding, just copy the string
2075          contents.  */
2076       if (lenp)
2077         {
2078           buf = scm_malloc (ilen);
2079           memcpy (buf, scm_i_string_chars (str), ilen);
2080           *lenp = ilen;
2081           return buf;
2082         }
2083       else
2084         {
2085           buf = scm_malloc (ilen + 1);
2086           memcpy (buf, scm_i_string_chars (str), ilen);
2087           buf[ilen] = '\0';
2088           return buf;
2089         }
2090     }
2091
2092
2093   buf = NULL;
2094   len = 0;
2095   enc = encoding;
2096   if (enc == NULL)
2097     enc = "ISO-8859-1";
2098   if (scm_i_is_narrow_string (str))
2099     {
2100       ret = mem_iconveh (scm_i_string_chars (str), ilen,
2101                          "ISO-8859-1", enc,
2102                          (enum iconv_ilseq_handler) handler, NULL,
2103                          &buf, &len);
2104
2105       if (ret != 0)
2106         scm_encoding_error (__func__, errno,
2107                             "cannot convert narrow string to output locale",
2108                             SCM_BOOL_F,
2109                             /* FIXME: Faulty character unknown.  */
2110                             SCM_BOOL_F);
2111     }
2112   else
2113     {
2114       buf = u32_conv_to_encoding (enc,
2115                                   (enum iconv_ilseq_handler) handler,
2116                                   (scm_t_uint32 *) scm_i_string_wide_chars (str),
2117                                   ilen,
2118                                   NULL,
2119                                   NULL, &len);
2120       if (buf == NULL)
2121         scm_encoding_error (__func__, errno,
2122                             "cannot convert wide string to output locale",
2123                             SCM_BOOL_F,
2124                             /* FIXME: Faulty character unknown.  */
2125                             SCM_BOOL_F);
2126     }
2127   if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
2128     {
2129       if (SCM_R6RS_ESCAPES_P)
2130         {
2131           /* The worst case is if the input string contains all 4-digit
2132              hex escapes.  "\uXXXX" (six characters) becomes "\xXXXX;"
2133              (seven characters).  Make BUF large enough to hold
2134              that.  */
2135           buf = scm_realloc (buf, (len * 7) / 6 + 1);
2136           unistring_escapes_to_r6rs_escapes (buf, &len);
2137         }
2138       else
2139         unistring_escapes_to_guile_escapes (buf, &len);
2140
2141       buf = scm_realloc (buf, len);
2142     }
2143   if (lenp)
2144     *lenp = len;
2145   else
2146     {
2147       buf = scm_realloc (buf, len + 1);
2148       buf[len] = '\0';
2149     }
2150
2151   scm_remember_upto_here_1 (str);
2152   return buf;
2153 }
2154
2155 size_t
2156 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
2157 {
2158   size_t len;
2159   char *result = NULL;
2160   if (!scm_is_string (str))
2161     scm_wrong_type_arg_msg (NULL, 0, str, "string");
2162   result = scm_to_locale_stringn (str, &len);
2163
2164   memcpy (buf, result, (len > max_len) ? max_len : len);
2165   free (result);
2166
2167   scm_remember_upto_here_1 (str);
2168   return len;
2169 }
2170
2171 \f
2172 /* Unicode string normalization.  */
2173
2174 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2175    libguile/i18n.c.  It would be useful to have this factored out into a more
2176    convenient location, but its use of alloca makes that tricky to do. */
2177
2178 static SCM
2179 normalize_str (SCM string, uninorm_t form)
2180 {
2181   SCM ret;
2182   scm_t_uint32 *w_str;
2183   scm_t_wchar *cbuf;
2184   size_t rlen, len = scm_i_string_length (string);
2185
2186   if (scm_i_is_narrow_string (string))
2187     {
2188       size_t i;
2189       const char *buf = scm_i_string_chars (string);
2190
2191       w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2192
2193       for (i = 0; i < len; i ++)
2194         w_str[i] = (unsigned char) buf[i];
2195       w_str[len] = 0;
2196     }
2197   else
2198     w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2199
2200   w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2201
2202   ret = scm_i_make_wide_string (rlen, &cbuf, 0);
2203   u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2204   free (w_str);
2205
2206   scm_i_try_narrow_string (ret);
2207
2208   return ret;
2209 }
2210
2211 SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2212             (SCM string),
2213             "Returns the NFC normalized form of @var{string}.")
2214 #define FUNC_NAME s_scm_string_normalize_nfc
2215 {
2216   SCM_VALIDATE_STRING (1, string);
2217   return normalize_str (string, UNINORM_NFC);
2218 }
2219 #undef FUNC_NAME
2220
2221 SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2222             (SCM string),
2223             "Returns the NFD normalized form of @var{string}.")
2224 #define FUNC_NAME s_scm_string_normalize_nfd
2225 {
2226   SCM_VALIDATE_STRING (1, string);
2227   return normalize_str (string, UNINORM_NFD);
2228 }
2229 #undef FUNC_NAME
2230
2231 SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2232             (SCM string),
2233             "Returns the NFKC normalized form of @var{string}.")
2234 #define FUNC_NAME s_scm_string_normalize_nfkc
2235 {
2236   SCM_VALIDATE_STRING (1, string);
2237   return normalize_str (string, UNINORM_NFKC);
2238 }
2239 #undef FUNC_NAME
2240
2241 SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2242             (SCM string),
2243             "Returns the NFKD normalized form of @var{string}.")
2244 #define FUNC_NAME s_scm_string_normalize_nfkd
2245 {
2246   SCM_VALIDATE_STRING (1, string);
2247   return normalize_str (string, UNINORM_NFKD);
2248 }
2249 #undef FUNC_NAME
2250
2251 /* converts C scm_array of strings to SCM scm_list of strings.
2252    If argc < 0, a null terminated scm_array is assumed.
2253    The current locale encoding is assumed */
2254 SCM
2255 scm_makfromstrs (int argc, char **argv)
2256 {
2257   int i = argc;
2258   SCM lst = SCM_EOL;
2259   if (0 > i)
2260     for (i = 0; argv[i]; i++);
2261   while (i--)
2262     lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2263   return lst;
2264 }
2265
2266 /* Return a newly allocated array of char pointers to each of the strings
2267    in args, with a terminating NULL pointer.  The strings are encoded using
2268    the current locale. */
2269
2270 char **
2271 scm_i_allocate_string_pointers (SCM list)
2272 #define FUNC_NAME "scm_i_allocate_string_pointers"
2273 {
2274   char **result;
2275   int list_len = scm_ilength (list);
2276   int i;
2277
2278   if (list_len < 0)
2279     scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2280
2281   result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2282                           "string pointers");
2283   result[list_len] = NULL;
2284
2285   /* The list might have been modified in another thread, so
2286      we check LIST before each access.
2287    */
2288   for (i = 0; i < list_len && scm_is_pair (list); i++)
2289     {
2290       SCM str = SCM_CAR (list);
2291       size_t len;  /* String length in bytes */
2292       char *c_str = scm_to_locale_stringn (str, &len);
2293
2294       /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2295          scm_malloc to allocate the returned string, which must be
2296          explicitly deallocated.  This forces us to copy the string a
2297          second time into a new buffer.  Ideally there would be variants
2298          of scm_to_*_stringn that can return garbage-collected buffers. */
2299
2300       result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2301       memcpy (result[i], c_str, len);
2302       result[i][len] = '\0';
2303       free (c_str);
2304
2305       list = SCM_CDR (list);
2306     }
2307
2308   return result;
2309 }
2310 #undef FUNC_NAME
2311
2312 void
2313 scm_i_get_substring_spec (size_t len,
2314                           SCM start, size_t *cstart,
2315                           SCM end, size_t *cend)
2316 {
2317   if (SCM_UNBNDP (start))
2318     *cstart = 0;
2319   else
2320     *cstart = scm_to_unsigned_integer (start, 0, len);
2321
2322   if (SCM_UNBNDP (end))
2323     *cend = len;
2324   else
2325     *cend = scm_to_unsigned_integer (end, *cstart, len);
2326 }
2327
2328 #if SCM_ENABLE_DEPRECATED
2329
2330 /* When these definitions are removed, it becomes reasonable to use
2331    read-only strings for string literals.  For that, change the reader
2332    to create string literals with scm_c_substring_read_only instead of
2333    with scm_c_substring_copy.
2334 */
2335
2336 int
2337 scm_i_deprecated_stringp (SCM str)
2338 {
2339   scm_c_issue_deprecation_warning
2340     ("SCM_STRINGP is deprecated.  Use scm_is_string instead.");
2341
2342   return scm_is_string (str);
2343 }
2344
2345 char *
2346 scm_i_deprecated_string_chars (SCM str)
2347 {
2348   char *chars;
2349
2350   scm_c_issue_deprecation_warning
2351     ("SCM_STRING_CHARS is deprecated.  See the manual for alternatives.");
2352
2353   /* We don't accept shared substrings here since they are not
2354      null-terminated.
2355   */
2356   if (IS_SH_STRING (str))
2357     scm_misc_error (NULL,
2358                     "SCM_STRING_CHARS does not work with shared substrings",
2359                     SCM_EOL);
2360
2361   /* We explicitly test for read-only strings to produce a better
2362      error message.
2363   */
2364
2365   if (IS_RO_STRING (str))
2366     scm_misc_error (NULL,
2367                     "SCM_STRING_CHARS does not work with read-only strings",
2368                     SCM_EOL);
2369
2370   /* The following is still wrong, of course...
2371    */
2372   str = scm_i_string_start_writing (str);
2373   chars = scm_i_string_writable_chars (str);
2374   scm_i_string_stop_writing ();
2375   return chars;
2376 }
2377
2378 size_t
2379 scm_i_deprecated_string_length (SCM str)
2380 {
2381   scm_c_issue_deprecation_warning
2382     ("SCM_STRING_LENGTH is deprecated.  Use scm_c_string_length instead.");
2383   return scm_c_string_length (str);
2384 }
2385
2386 #endif
2387
2388 static SCM
2389 string_handle_ref (scm_t_array_handle *h, size_t index)
2390 {
2391   return scm_c_string_ref (h->array, index);
2392 }
2393
2394 static void
2395 string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2396 {
2397   scm_c_string_set_x (h->array, index, val);
2398 }
2399
2400 static void
2401 string_get_handle (SCM v, scm_t_array_handle *h)
2402 {
2403   h->array = v;
2404   h->ndims = 1;
2405   h->dims = &h->dim0;
2406   h->dim0.lbnd = 0;
2407   h->dim0.ubnd = scm_c_string_length (v) - 1;
2408   h->dim0.inc = 1;
2409   h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2410   h->elements = h->writable_elements = NULL;
2411 }
2412
2413 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2414                           string_handle_ref, string_handle_set,
2415                           string_get_handle)
2416 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2417
2418 void
2419 scm_init_strings ()
2420 {
2421   scm_nullstr = scm_i_make_string (0, NULL, 0);
2422
2423 #include "libguile/strings.x"
2424 }
2425
2426
2427 /*
2428   Local Variables:
2429   c-file-style: "gnu"
2430   End:
2431 */