libguile/strings.c

   1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   2  *
   3  * This library is free software; you can redistribute it and/or
   4  * modify it under the terms of the GNU Lesser General Public License
   5  * as published by the Free Software Foundation; either version 3 of
   6  * the License, or (at your option) any later version.
   7  *
   8  * This library is distributed in the hope that it will be useful, but
   9  * WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * Lesser General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU Lesser General Public
  14  * License along with this library; if not, write to the Free Software
  15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  16  * 02110-1301 USA
  17  */
  18
  19
  20 \f
  21 #ifdef HAVE_CONFIG_H
  22 # include <config.h>
  23 #endif
  24
  25 #include <alloca.h>
  26 #include <string.h>
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <uninorm.h>
  30 #include <unistr.h>
  31 #include <uniconv.h>
  32
  33 #include "striconveh.h"
  34
  35 #include "libguile/_scm.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/root.h"
  38 #include "libguile/strings.h"
  39 #include "libguile/error.h"
  40 #include "libguile/generalized-vectors.h"
  41 #include "libguile/deprecation.h"
  42 #include "libguile/validate.h"
  43 #include "libguile/private-options.h"
  44
  45 \f
  46
  47 /* {Strings}
  48  */
  49
  50
  51 /* Stringbufs
  52  *
  53  * XXX - keeping an accurate refcount during GC seems to be quite
  54  * tricky, so we just keep score of whether a stringbuf might be
  55  * shared, not whether it definitely is.
  56  *
  57  * The scheme I (mvo) tried to keep an accurate reference count would
  58  * recount all strings that point to a stringbuf during the mark-phase
  59  * of the GC.  This was done since one cannot access the stringbuf of
  60  * a string when that string is freed (in order to decrease the
  61  * reference count).  The memory of the stringbuf might have been
  62  * reused already for something completely different.
  63  *
  64  * This recounted worked for a small number of threads beating on
  65  * cow-strings, but it failed randomly with more than 10 threads, say.
  66  * I couldn't figure out what went wrong, so I used the conservative
  67  * approach implemented below.
  68  *
  69  * There are 2 storage strategies for stringbufs: 8-bit and wide.  8-bit
  70  * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
  71  * strings.
  72  */
  73
  74 /* The size in words of the stringbuf header (type tag + size).  */
  75 #define STRINGBUF_HEADER_SIZE   2U
  76
  77 #define STRINGBUF_HEADER_BYTES  (STRINGBUF_HEADER_SIZE * sizeof (SCM))
  78
  79 #define STRINGBUF_F_SHARED      SCM_I_STRINGBUF_F_SHARED
  80 #define STRINGBUF_F_WIDE        SCM_I_STRINGBUF_F_WIDE
  81
  82 #define STRINGBUF_TAG           scm_tc7_stringbuf
  83 #define STRINGBUF_SHARED(buf)   (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
  84 #define STRINGBUF_WIDE(buf)     (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
  85
  86 #define STRINGBUF_CONTENTS(buf) ((void *)                               \
  87                                  SCM_CELL_OBJECT_LOC (buf,              \
  88                                                       STRINGBUF_HEADER_SIZE))
  89 #define STRINGBUF_CHARS(buf)    ((unsigned char *) STRINGBUF_CONTENTS (buf))
  90 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
  91
  92 #define STRINGBUF_LENGTH(buf)   (SCM_CELL_WORD_1 (buf))
  93
  94 #define SET_STRINGBUF_SHARED(buf)                                       \
  95   do                                                                    \
  96     {                                                                   \
  97       /* Don't modify BUF if it's already marked as shared since it might be \
  98          a read-only, statically allocated stringbuf.  */               \
  99       if (SCM_LIKELY (!STRINGBUF_SHARED (buf)))                         \
 100         SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
 101     }                                                                   \
 102   while (0)
 103
 104 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 105 static size_t lenhist[1001];
 106 #endif
 107
 108 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
 109    characters. */
 110 static SCM
 111 make_stringbuf (size_t len)
 112 {
 113   /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
 114      scm_i_symbol_chars, all stringbufs are null-terminated.  Once
 115      SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
 116      has been changed for scm_i_symbol_chars, this null-termination
 117      can be dropped.
 118   */
 119
 120   SCM buf;
 121
 122 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 123   if (len < 1000)
 124     lenhist[len]++;
 125   else
 126     lenhist[1000]++;
 127 #endif
 128
 129   buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
 130                                             "string"));
 131
 132   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
 133   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 134
 135   STRINGBUF_CHARS (buf)[len] = 0;
 136
 137   return buf;
 138 }
 139
 140 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
 141    characters.  */
 142 static SCM
 143 make_wide_stringbuf (size_t len)
 144 {
 145   SCM buf;
 146   size_t raw_len;
 147
 148 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 149   if (len < 1000)
 150     lenhist[len]++;
 151   else
 152     lenhist[1000]++;
 153 #endif
 154
 155   raw_len = (len + 1) * sizeof (scm_t_wchar);
 156   buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
 157                                             "string"));
 158
 159   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
 160   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 161
 162   STRINGBUF_WIDE_CHARS (buf)[len] = 0;
 163
 164   return buf;
 165 }
 166
 167 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
 168    characters from BUF.  */
 169 static SCM
 170 wide_stringbuf (SCM buf)
 171 {
 172   SCM new_buf;
 173
 174   if (STRINGBUF_WIDE (buf))
 175     new_buf = buf;
 176   else
 177     {
 178       size_t i, len;
 179       scm_t_wchar *mem;
 180
 181       len = STRINGBUF_LENGTH (buf);
 182
 183       new_buf = make_wide_stringbuf (len);
 184
 185       mem = STRINGBUF_WIDE_CHARS (new_buf);
 186       for (i = 0; i < len; i++)
 187         mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
 188       mem[len] = 0;
 189     }
 190
 191   return new_buf;
 192 }
 193
 194 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
 195    characters from BUF, if possible.  */
 196 static SCM
 197 narrow_stringbuf (SCM buf)
 198 {
 199   SCM new_buf;
 200
 201   if (!STRINGBUF_WIDE (buf))
 202     new_buf = buf;
 203   else
 204     {
 205       size_t i, len;
 206       scm_t_wchar *wmem;
 207       unsigned char *mem;
 208
 209       len = STRINGBUF_LENGTH (buf);
 210       wmem = STRINGBUF_WIDE_CHARS (buf);
 211
 212       for (i = 0; i < len; i++)
 213         if (wmem[i] > 0xFF)
 214           /* BUF cannot be narrowed.  */
 215           return buf;
 216
 217       new_buf = make_stringbuf (len);
 218
 219       mem = STRINGBUF_CHARS (new_buf);
 220       for (i = 0; i < len; i++)
 221         mem[i] = (unsigned char) wmem[i];
 222       mem[len] = 0;
 223     }
 224
 225   return new_buf;
 226 }
 227
 228 \f
 229
 230 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
 231 SCM_PTHREAD_ATFORK_LOCK_STATIC_MUTEX (stringbuf_write_mutex);
 232
 233 static scm_i_pthread_mutex_t iconv_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
 234
 235 void
 236 scm_i_lock_iconv (void)
 237 {
 238   scm_i_pthread_mutex_lock (&iconv_mutex);
 239 }
 240
 241 void
 242 scm_i_unlock_iconv (void)
 243 {
 244   scm_i_pthread_mutex_unlock (&iconv_mutex);
 245 }
 246
 247 \f
 248 /* Copy-on-write strings.
 249  */
 250
 251 #define STRING_TAG            scm_tc7_string
 252
 253 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
 254 #define STRING_START(str)     ((size_t)SCM_CELL_WORD_2(str))
 255 #define STRING_LENGTH(str)    ((size_t)SCM_CELL_WORD_3(str))
 256
 257 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
 258 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
 259
 260 #define IS_STRING(str)        (SCM_HAS_TYP7 (str, STRING_TAG))
 261
 262 /* Read-only strings.
 263  */
 264
 265 #define RO_STRING_TAG         scm_tc7_ro_string
 266 #define IS_RO_STRING(str)     (SCM_CELL_TYPE(str)==RO_STRING_TAG)
 267
 268 /* Mutation-sharing substrings
 269  */
 270
 271 #define SH_STRING_TAG       (scm_tc7_string + 0x100)
 272
 273 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
 274 /* START and LENGTH as for STRINGs. */
 275
 276 #define IS_SH_STRING(str)   (SCM_CELL_TYPE(str)==SH_STRING_TAG)
 277
 278 SCM scm_nullstr;
 279
 280 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
 281    characters.  CHARSP, if not NULL, will be set to location of the
 282    char array.  If READ_ONLY_P, the returned string is read-only;
 283    otherwise it is writable.  */
 284 SCM
 285 scm_i_make_string (size_t len, char **charsp, int read_only_p)
 286 {
 287   static SCM null_stringbuf = SCM_BOOL_F;
 288   SCM buf;
 289   SCM res;
 290
 291   if (len == 0)
 292     {
 293       if (SCM_UNLIKELY (scm_is_false (null_stringbuf)))
 294         {
 295           null_stringbuf = make_stringbuf (0);
 296           SET_STRINGBUF_SHARED (null_stringbuf);
 297         }
 298       buf = null_stringbuf;
 299     }
 300   else
 301     buf = make_stringbuf (len);
 302
 303   if (charsp)
 304     *charsp = (char *) STRINGBUF_CHARS (buf);
 305   res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
 306                          SCM_UNPACK (buf),
 307                          (scm_t_bits) 0, (scm_t_bits) len);
 308   return res;
 309 }
 310
 311 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
 312    characters.  CHARSP, if not NULL, will be set to location of the
 313    character array.  If READ_ONLY_P, the returned string is read-only;
 314    otherwise it is writable.  */
 315 SCM
 316 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
 317 {
 318   SCM buf = make_wide_stringbuf (len);
 319   SCM res;
 320   if (charsp)
 321     *charsp = STRINGBUF_WIDE_CHARS (buf);
 322   res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
 323                          SCM_UNPACK (buf),
 324                          (scm_t_bits) 0, (scm_t_bits) len);
 325   return res;
 326 }
 327
 328 static void
 329 validate_substring_args (SCM str, size_t start, size_t end)
 330 {
 331   if (!IS_STRING (str))
 332     scm_wrong_type_arg_msg (NULL, 0, str, "string");
 333   if (start > STRING_LENGTH (str))
 334     scm_out_of_range (NULL, scm_from_size_t (start));
 335   if (end > STRING_LENGTH (str) || end < start)
 336     scm_out_of_range (NULL, scm_from_size_t (end));
 337 }
 338
 339 static inline void
 340 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
 341 {
 342   *start = STRING_START (*str);
 343   if (IS_SH_STRING (*str))
 344     {
 345       *str = SH_STRING_STRING (*str);
 346       *start += STRING_START (*str);
 347     }
 348   *buf = STRING_STRINGBUF (*str);
 349 }
 350
 351 SCM
 352 scm_i_substring (SCM str, size_t start, size_t end)
 353 {
 354   if (start == end)
 355     return scm_i_make_string (0, NULL, 0);
 356   else
 357     {
 358       SCM buf;
 359       size_t str_start;
 360       get_str_buf_start (&str, &buf, &str_start);
 361       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 362       SET_STRINGBUF_SHARED (buf);
 363       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 364       return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
 365                               (scm_t_bits)str_start + start,
 366                               (scm_t_bits) end - start);
 367     }
 368 }
 369
 370 SCM
 371 scm_i_substring_read_only (SCM str, size_t start, size_t end)
 372 {
 373   if (start == end)
 374     return scm_i_make_string (0, NULL, 1);
 375   else
 376     {
 377       SCM buf;
 378       size_t str_start;
 379       get_str_buf_start (&str, &buf, &str_start);
 380       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 381       SET_STRINGBUF_SHARED (buf);
 382       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 383       return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
 384                               (scm_t_bits)str_start + start,
 385                               (scm_t_bits) end - start);
 386     }
 387 }
 388
 389 SCM
 390 scm_i_substring_copy (SCM str, size_t start, size_t end)
 391 {
 392   if (start == end)
 393     return scm_i_make_string (0, NULL, 0);
 394   else
 395     {
 396       size_t len = end - start;
 397       SCM buf, my_buf, substr;
 398       size_t str_start;
 399       int wide = 0;
 400       get_str_buf_start (&str, &buf, &str_start);
 401       if (scm_i_is_narrow_string (str))
 402         {
 403           my_buf = make_stringbuf (len);
 404           memcpy (STRINGBUF_CHARS (my_buf),
 405                   STRINGBUF_CHARS (buf) + str_start + start, len);
 406         }
 407       else
 408         {
 409           my_buf = make_wide_stringbuf (len);
 410           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
 411                    (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
 412                                      + start), len);
 413           wide = 1;
 414         }
 415       scm_remember_upto_here_1 (buf);
 416       substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
 417                                 (scm_t_bits) 0, (scm_t_bits) len);
 418       if (wide)
 419         scm_i_try_narrow_string (substr);
 420       return substr;
 421     }
 422 }
 423
 424 SCM
 425 scm_i_substring_shared (SCM str, size_t start, size_t end)
 426 {
 427   if (start == 0 && end == STRING_LENGTH (str))
 428     return str;
 429   else if (start == end)
 430     return scm_i_make_string (0, NULL, 0);
 431   else
 432     {
 433       size_t len = end - start;
 434       if (IS_SH_STRING (str))
 435         {
 436           start += STRING_START (str);
 437           str = SH_STRING_STRING (str);
 438         }
 439       return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
 440                               (scm_t_bits)start, (scm_t_bits) len);
 441     }
 442 }
 443
 444 SCM
 445 scm_c_substring (SCM str, size_t start, size_t end)
 446 {
 447   validate_substring_args (str, start, end);
 448   return scm_i_substring (str, start, end);
 449 }
 450
 451 SCM
 452 scm_c_substring_read_only (SCM str, size_t start, size_t end)
 453 {
 454   validate_substring_args (str, start, end);
 455   return scm_i_substring_read_only (str, start, end);
 456 }
 457
 458 SCM
 459 scm_c_substring_copy (SCM str, size_t start, size_t end)
 460 {
 461   validate_substring_args (str, start, end);
 462   return scm_i_substring_copy (str, start, end);
 463 }
 464
 465 SCM
 466 scm_c_substring_shared (SCM str, size_t start, size_t end)
 467 {
 468   validate_substring_args (str, start, end);
 469   return scm_i_substring_shared (str, start, end);
 470 }
 471
 472 \f
 473 /* Internal accessors
 474  */
 475
 476 /* Returns the number of characters in STR.  This may be different
 477    than the memory size of the string storage.  */
 478 size_t
 479 scm_i_string_length (SCM str)
 480 {
 481   return STRING_LENGTH (str);
 482 }
 483
 484 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
 485    encoding.  False if it is 'wide', having a 32-bit UCS-4
 486    encoding.  */
 487 int
 488 scm_i_is_narrow_string (SCM str)
 489 {
 490   if (IS_SH_STRING (str))
 491     str = SH_STRING_STRING (str);
 492
 493   return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
 494 }
 495
 496 /* Try to coerce a string to be narrow.  It if is narrow already, do
 497    nothing.  If it is wide, shrink it to narrow if none of its
 498    characters are above 0xFF.  Return true if the string is narrow or
 499    was made to be narrow.  */
 500 int
 501 scm_i_try_narrow_string (SCM str)
 502 {
 503   if (IS_SH_STRING (str))
 504     str = SH_STRING_STRING (str);
 505
 506   SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
 507
 508   return scm_i_is_narrow_string (str);
 509 }
 510
 511 /* Return a pointer to the raw data of the string, which can be either Latin-1
 512    or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'.  */
 513 const void *
 514 scm_i_string_data (SCM str)
 515 {
 516   SCM buf;
 517   size_t start;
 518   const char *data;
 519
 520   get_str_buf_start (&str, &buf, &start);
 521
 522   data = STRINGBUF_CONTENTS (buf);
 523   data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
 524
 525   return data;
 526 }
 527
 528 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
 529    STR.  */
 530 const char *
 531 scm_i_string_chars (SCM str)
 532 {
 533   SCM buf;
 534   size_t start;
 535   get_str_buf_start (&str, &buf, &start);
 536   if (scm_i_is_narrow_string (str))
 537     return (const char *) STRINGBUF_CHARS (buf) + start;
 538   else
 539     scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
 540                     scm_list_1 (str));
 541   return NULL;
 542 }
 543
 544 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
 545    STR.  */
 546 const scm_t_wchar *
 547 scm_i_string_wide_chars (SCM str)
 548 {
 549   SCM buf;
 550   size_t start;
 551
 552   get_str_buf_start (&str, &buf, &start);
 553   if (!scm_i_is_narrow_string (str))
 554     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
 555   else
 556     scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
 557                     scm_list_1 (str));
 558 }
 559
 560 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
 561    a new string buffer, so that it can be modified without modifying
 562    other strings.  Also, lock the string mutex.  Later, one must call
 563    scm_i_string_stop_writing to unlock the mutex.  */
 564 SCM
 565 scm_i_string_start_writing (SCM orig_str)
 566 {
 567   SCM buf, str = orig_str;
 568   size_t start;
 569
 570   get_str_buf_start (&str, &buf, &start);
 571   if (IS_RO_STRING (str))
 572     scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
 573
 574   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 575   if (STRINGBUF_SHARED (buf))
 576     {
 577       /* Clone the stringbuf.  */
 578       size_t len = STRING_LENGTH (str);
 579       SCM new_buf;
 580
 581       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 582
 583       if (scm_i_is_narrow_string (str))
 584         {
 585           new_buf = make_stringbuf (len);
 586           memcpy (STRINGBUF_CHARS (new_buf),
 587                   STRINGBUF_CHARS (buf) + STRING_START (str), len);
 588
 589         }
 590       else
 591         {
 592           new_buf = make_wide_stringbuf (len);
 593           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 594                    (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
 595                                      + STRING_START (str)), len);
 596         }
 597
 598       SET_STRING_STRINGBUF (str, new_buf);
 599       start -= STRING_START (str);
 600
 601       /* FIXME: The following operations are not atomic, so other threads
 602          looking at STR may see an inconsistent state.  Nevertheless it can't
 603          hurt much since (i) accessing STR while it is being mutated can't
 604          yield a crash, and (ii) concurrent accesses to STR should be
 605          protected by a mutex at the application level.  The latter may not
 606          apply when STR != ORIG_STR, though.  */
 607       SET_STRING_START (str, 0);
 608       SET_STRING_STRINGBUF (str, new_buf);
 609
 610       buf = new_buf;
 611
 612       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 613     }
 614   return orig_str;
 615 }
 616
 617 /* Return a pointer to the 8-bit Latin-1 chars of a string.  */
 618 char *
 619 scm_i_string_writable_chars (SCM str)
 620 {
 621   SCM buf;
 622   size_t start;
 623
 624   get_str_buf_start (&str, &buf, &start);
 625   if (scm_i_is_narrow_string (str))
 626     return (char *) STRINGBUF_CHARS (buf) + start;
 627   else
 628     scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
 629                     scm_list_1 (str));
 630   return NULL;
 631 }
 632
 633 /* Return a pointer to the UCS-4 codepoints of a string.  */
 634 static scm_t_wchar *
 635 scm_i_string_writable_wide_chars (SCM str)
 636 {
 637   SCM buf;
 638   size_t start;
 639
 640   get_str_buf_start (&str, &buf, &start);
 641   if (!scm_i_is_narrow_string (str))
 642     return STRINGBUF_WIDE_CHARS (buf) + start;
 643   else
 644     scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
 645                     scm_list_1 (str));
 646 }
 647
 648 /* Unlock the string mutex that was locked when
 649    scm_i_string_start_writing was called.  */
 650 void
 651 scm_i_string_stop_writing (void)
 652 {
 653   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 654 }
 655
 656 /* Return the Xth character of STR as a UCS-4 codepoint.  */
 657 scm_t_wchar
 658 scm_i_string_ref (SCM str, size_t x)
 659 {
 660   if (scm_i_is_narrow_string (str))
 661     return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
 662   else
 663     return scm_i_string_wide_chars (str)[x];
 664 }
 665
 666 /* Returns index+1 of the first char in STR that matches C, or
 667    0 if the char is not found.  */
 668 int
 669 scm_i_string_contains_char (SCM str, char ch)
 670 {
 671   size_t i;
 672   size_t len = scm_i_string_length (str);
 673
 674   i = 0;
 675   if (scm_i_is_narrow_string (str))
 676     {
 677       while (i < len)
 678         {
 679           if (scm_i_string_chars (str)[i] == ch)
 680             return i+1;
 681           i++;
 682         }
 683     }
 684   else
 685     {
 686       while (i < len)
 687         {
 688           if (scm_i_string_wide_chars (str)[i]
 689               == (unsigned char) ch)
 690             return i+1;
 691           i++;
 692         }
 693     }
 694   return 0;
 695 }
 696
 697 int
 698 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
 699 {
 700   if (scm_i_is_narrow_string (sstr))
 701     {
 702       const char *a = scm_i_string_chars (sstr) + start_x;
 703       const char *b = cstr;
 704       return strncmp (a, b, strlen(b));
 705     }
 706   else
 707     {
 708       size_t i;
 709       const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
 710       const char *b = cstr;
 711       for (i = 0; i < strlen (b); i++)
 712         {
 713           if (a[i] != (unsigned char) b[i])
 714             return 1;
 715         }
 716     }
 717   return 0;
 718 }
 719
 720 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
 721 void
 722 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
 723 {
 724   if (IS_SH_STRING (str))
 725     {
 726       p += STRING_START (str);
 727       str = SH_STRING_STRING (str);
 728     }
 729
 730   if (chr > 0xFF && scm_i_is_narrow_string (str))
 731     SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
 732
 733   if (scm_i_is_narrow_string (str))
 734     {
 735       char *dst = scm_i_string_writable_chars (str);
 736       dst[p] = chr;
 737     }
 738   else
 739     {
 740       scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
 741       dst[p] = chr;
 742     }
 743 }
 744
 745 \f
 746 /* Symbols.
 747
 748    Basic symbol creation and accessing is done here, the rest is in
 749    symbols.[hc].  This has been done to keep stringbufs and the
 750    internals of strings and string-like objects confined to this file.
 751 */
 752
 753 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
 754
 755 SCM
 756 scm_i_make_symbol (SCM name, scm_t_bits flags,
 757                    unsigned long hash, SCM props)
 758 {
 759   SCM buf;
 760   size_t start = STRING_START (name);
 761   size_t length = STRING_LENGTH (name);
 762
 763   if (IS_SH_STRING (name))
 764     {
 765       name = SH_STRING_STRING (name);
 766       start += STRING_START (name);
 767     }
 768   buf = SYMBOL_STRINGBUF (name);
 769
 770   if (start == 0 && length == STRINGBUF_LENGTH (buf))
 771     {
 772       /* reuse buf. */
 773       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 774       SET_STRINGBUF_SHARED (buf);
 775       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 776     }
 777   else
 778     {
 779       /* make new buf. */
 780       if (scm_i_is_narrow_string (name))
 781         {
 782           SCM new_buf = make_stringbuf (length);
 783           memcpy (STRINGBUF_CHARS (new_buf),
 784                   STRINGBUF_CHARS (buf) + start, length);
 785           buf = new_buf;
 786         }
 787       else
 788         {
 789           SCM new_buf = make_wide_stringbuf (length);
 790           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 791                    (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
 792                    length);
 793           buf = new_buf;
 794         }
 795     }
 796   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 797                           (scm_t_bits) hash, SCM_UNPACK (props));
 798 }
 799
 800 SCM
 801 scm_i_c_make_symbol (const char *name, size_t len,
 802                      scm_t_bits flags, unsigned long hash, SCM props)
 803 {
 804   SCM buf = make_stringbuf (len);
 805   memcpy (STRINGBUF_CHARS (buf), name, len);
 806
 807   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 808                           (scm_t_bits) hash, SCM_UNPACK (props));
 809 }
 810
 811 /* Returns the number of characters in SYM.  This may be different
 812    from the memory size of SYM.  */
 813 size_t
 814 scm_i_symbol_length (SCM sym)
 815 {
 816   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 817 }
 818
 819 size_t
 820 scm_c_symbol_length (SCM sym)
 821 #define FUNC_NAME "scm_c_symbol_length"
 822 {
 823   SCM_VALIDATE_SYMBOL (1, sym);
 824
 825   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 826 }
 827 #undef FUNC_NAME
 828
 829 /* True if the name of SYM is stored as a Latin-1 encoded string.
 830    False if it is stored as a 32-bit UCS-4-encoded string.  */
 831 int
 832 scm_i_is_narrow_symbol (SCM sym)
 833 {
 834   SCM buf;
 835
 836   buf = SYMBOL_STRINGBUF (sym);
 837   return !STRINGBUF_WIDE (buf);
 838 }
 839
 840 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
 841    contains the name of SYM.  */
 842 const char *
 843 scm_i_symbol_chars (SCM sym)
 844 {
 845   SCM buf;
 846
 847   buf = SYMBOL_STRINGBUF (sym);
 848   if (!STRINGBUF_WIDE (buf))
 849     return (const char *) STRINGBUF_CHARS (buf);
 850   else
 851     scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
 852                     scm_list_1 (sym));
 853 }
 854
 855 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
 856    symbol's name.  */
 857 const scm_t_wchar *
 858 scm_i_symbol_wide_chars (SCM sym)
 859 {
 860   SCM buf;
 861
 862   buf = SYMBOL_STRINGBUF (sym);
 863   if (STRINGBUF_WIDE (buf))
 864     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
 865   else
 866     scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
 867                     scm_list_1 (sym));
 868 }
 869
 870 SCM
 871 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
 872 {
 873   SCM buf = SYMBOL_STRINGBUF (sym);
 874   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 875   SET_STRINGBUF_SHARED (buf);
 876   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 877   return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
 878                           (scm_t_bits)start, (scm_t_bits) end - start);
 879 }
 880
 881 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint.  */
 882 scm_t_wchar
 883 scm_i_symbol_ref (SCM sym, size_t x)
 884 {
 885   if (scm_i_is_narrow_symbol (sym))
 886     return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
 887   else
 888     return scm_i_symbol_wide_chars (sym)[x];
 889 }
 890
 891 /* Debugging
 892  */
 893
 894 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
 895             "Returns an association list containing debugging information\n"
 896             "for @var{str}. The association list has the following entries."
 897             "@table @code\n"
 898             "@item string\n"
 899             "The string itself.\n"
 900             "@item start\n"
 901             "The start index of the string into its stringbuf\n"
 902             "@item length\n"
 903             "The length of the string\n"
 904             "@item shared\n"
 905             "If this string is a substring, it returns its parent string.\n"
 906             "Otherwise, it returns @code{#f}\n"
 907             "@item read-only\n"
 908             "@code{#t} if the string is read-only\n"
 909             "@item stringbuf-chars\n"
 910             "A new string containing this string's stringbuf's characters\n"
 911             "@item stringbuf-length\n"
 912             "The number of characters in this stringbuf\n"
 913             "@item stringbuf-shared\n"
 914             "@code{#t} if this stringbuf is shared\n"
 915             "@item stringbuf-wide\n"
 916             "@code{#t} if this stringbuf's characters are stored in a\n"
 917             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
 918             "buffer\n"
 919             "@end table")
 920 #define FUNC_NAME s_scm_sys_string_dump
 921 {
 922   SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
 923   SCM buf;
 924   SCM_VALIDATE_STRING (1, str);
 925
 926   /* String info */
 927   e1 = scm_cons (scm_from_latin1_symbol ("string"),
 928                  str);
 929   e2 = scm_cons (scm_from_latin1_symbol ("start"),
 930                  scm_from_size_t (STRING_START (str)));
 931   e3 = scm_cons (scm_from_latin1_symbol ("length"),
 932                  scm_from_size_t (STRING_LENGTH (str)));
 933
 934   if (IS_SH_STRING (str))
 935     {
 936       e4 = scm_cons (scm_from_latin1_symbol ("shared"),
 937                      SH_STRING_STRING (str));
 938       buf = STRING_STRINGBUF (SH_STRING_STRING (str));
 939     }
 940   else
 941     {
 942       e4 = scm_cons (scm_from_latin1_symbol ("shared"),
 943                      SCM_BOOL_F);
 944       buf = STRING_STRINGBUF (str);
 945     }
 946
 947   if (IS_RO_STRING (str))
 948     e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
 949                    SCM_BOOL_T);
 950   else
 951     e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
 952                    SCM_BOOL_F);
 953
 954   /* Stringbuf info */
 955   if (!STRINGBUF_WIDE (buf))
 956     {
 957       size_t len = STRINGBUF_LENGTH (buf);
 958       char *cbuf;
 959       SCM sbc = scm_i_make_string (len, &cbuf, 0);
 960       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
 961       e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 962                      sbc);
 963     }
 964   else
 965     {
 966       size_t len = STRINGBUF_LENGTH (buf);
 967       scm_t_wchar *cbuf;
 968       SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
 969       u32_cpy ((scm_t_uint32 *) cbuf,
 970                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
 971       e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 972                      sbc);
 973     }
 974   e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
 975                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
 976   if (STRINGBUF_SHARED (buf))
 977     e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 978                    SCM_BOOL_T);
 979   else
 980     e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 981                    SCM_BOOL_F);
 982   if (STRINGBUF_WIDE (buf))
 983     e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 984                    SCM_BOOL_T);
 985   else
 986     e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 987                    SCM_BOOL_F);
 988
 989   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
 990 }
 991 #undef FUNC_NAME
 992
 993 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
 994             "Returns an association list containing debugging information\n"
 995             "for @var{sym}. The association list has the following entries."
 996             "@table @code\n"
 997             "@item symbol\n"
 998             "The symbol itself\n"
 999             "@item hash\n"
1000             "Its hash value\n"
1001             "@item interned\n"
1002             "@code{#t} if it is an interned symbol\n"
1003             "@item stringbuf-chars\n"
1004             "A new string containing this symbols's stringbuf's characters\n"
1005             "@item stringbuf-length\n"
1006             "The number of characters in this stringbuf\n"
1007             "@item stringbuf-shared\n"
1008             "@code{#t} if this stringbuf is shared\n"
1009             "@item stringbuf-wide\n"
1010             "@code{#t} if this stringbuf's characters are stored in a\n"
1011             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
1012             "buffer\n"
1013             "@end table")
1014 #define FUNC_NAME s_scm_sys_symbol_dump
1015 {
1016   SCM e1, e2, e3, e4, e5, e6, e7;
1017   SCM buf;
1018   SCM_VALIDATE_SYMBOL (1, sym);
1019   e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
1020                  sym);
1021   e2 = scm_cons (scm_from_latin1_symbol ("hash"),
1022                  scm_from_ulong (scm_i_symbol_hash (sym)));
1023   e3 = scm_cons (scm_from_latin1_symbol ("interned"),
1024                  scm_symbol_interned_p (sym));
1025   buf = SYMBOL_STRINGBUF (sym);
1026
1027   /* Stringbuf info */
1028   if (!STRINGBUF_WIDE (buf))
1029     {
1030       size_t len = STRINGBUF_LENGTH (buf);
1031       char *cbuf;
1032       SCM sbc = scm_i_make_string (len, &cbuf, 0);
1033       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
1034       e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1035                      sbc);
1036     }
1037   else
1038     {
1039       size_t len = STRINGBUF_LENGTH (buf);
1040       scm_t_wchar *cbuf;
1041       SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
1042       u32_cpy ((scm_t_uint32 *) cbuf,
1043                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
1044       e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1045                      sbc);
1046     }
1047   e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
1048                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
1049   if (STRINGBUF_SHARED (buf))
1050     e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1051                    SCM_BOOL_T);
1052   else
1053     e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1054                    SCM_BOOL_F);
1055   if (STRINGBUF_WIDE (buf))
1056     e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1057                     SCM_BOOL_T);
1058   else
1059     e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1060                     SCM_BOOL_F);
1061   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
1062
1063 }
1064 #undef FUNC_NAME
1065
1066 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1067
1068 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
1069 #define FUNC_NAME s_scm_sys_stringbuf_hist
1070 {
1071   int i;
1072   for (i = 0; i < 1000; i++)
1073     if (lenhist[i])
1074       fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1075   fprintf (stderr, ">999: %u\n", lenhist[1000]);
1076   return SCM_UNSPECIFIED;
1077 }
1078 #undef FUNC_NAME
1079
1080 #endif
1081
1082 \f
1083
1084 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1085             (SCM obj),
1086             "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1087 #define FUNC_NAME s_scm_string_p
1088 {
1089   return scm_from_bool (IS_STRING (obj));
1090 }
1091 #undef FUNC_NAME
1092
1093
1094 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1095
1096 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1097             (SCM chrs),
1098             "@deffnx {Scheme Procedure} list->string chrs\n"
1099             "Return a newly allocated string composed of the arguments,\n"
1100             "@var{chrs}.")
1101 #define FUNC_NAME s_scm_string
1102 {
1103   SCM result = SCM_BOOL_F;
1104   SCM rest;
1105   size_t len;
1106   size_t p = 0;
1107   long i;
1108   int wide = 0;
1109
1110   /* Verify that this is a list of chars.  */
1111   i = scm_ilength (chrs);
1112   SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1113
1114   len = (size_t) i;
1115   rest = chrs;
1116
1117   while (len > 0 && scm_is_pair (rest))
1118     {
1119       SCM elt = SCM_CAR (rest);
1120       SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1121       if (SCM_CHAR (elt) > 0xFF)
1122         wide = 1;
1123       rest = SCM_CDR (rest);
1124       len--;
1125       scm_remember_upto_here_1 (elt);
1126     }
1127
1128   /* Construct a string containing this list of chars.  */
1129   len = (size_t) i;
1130   rest = chrs;
1131
1132   if (wide == 0)
1133     {
1134       char *buf;
1135
1136       result = scm_i_make_string (len, NULL, 0);
1137       result = scm_i_string_start_writing (result);
1138       buf = scm_i_string_writable_chars (result);
1139       while (len > 0 && scm_is_pair (rest))
1140         {
1141           SCM elt = SCM_CAR (rest);
1142           buf[p] = (unsigned char) SCM_CHAR (elt);
1143           p++;
1144           rest = SCM_CDR (rest);
1145           len--;
1146           scm_remember_upto_here_1 (elt);
1147         }
1148     }
1149   else
1150     {
1151       scm_t_wchar *buf;
1152
1153       result = scm_i_make_wide_string (len, NULL, 0);
1154       result = scm_i_string_start_writing (result);
1155       buf = scm_i_string_writable_wide_chars (result);
1156       while (len > 0 && scm_is_pair (rest))
1157         {
1158           SCM elt = SCM_CAR (rest);
1159           buf[p] = SCM_CHAR (elt);
1160           p++;
1161           rest = SCM_CDR (rest);
1162           len--;
1163           scm_remember_upto_here_1 (elt);
1164         }
1165     }
1166   scm_i_string_stop_writing ();
1167
1168   if (len > 0)
1169     scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1170   if (!scm_is_null (rest))
1171     scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1172
1173   return result;
1174 }
1175 #undef FUNC_NAME
1176
1177 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1178             (SCM k, SCM chr),
1179             "Return a newly allocated string of\n"
1180             "length @var{k}.  If @var{chr} is given, then all elements of\n"
1181             "the string are initialized to @var{chr}, otherwise the contents\n"
1182             "of the string are all set to @code{#\nul}.")
1183 #define FUNC_NAME s_scm_make_string
1184 {
1185   return scm_c_make_string (scm_to_size_t (k), chr);
1186 }
1187 #undef FUNC_NAME
1188
1189 SCM
1190 scm_c_make_string (size_t len, SCM chr)
1191 #define FUNC_NAME NULL
1192 {
1193   size_t p;
1194   char *contents = NULL;
1195   SCM res = scm_i_make_string (len, &contents, 0);
1196
1197   /* If no char is given, initialize string contents to NULL.  */
1198   if (SCM_UNBNDP (chr))
1199     memset (contents, 0, len);
1200   else
1201     {
1202       SCM_VALIDATE_CHAR (0, chr);
1203       res = scm_i_string_start_writing (res);
1204       for (p = 0; p < len; p++)
1205         scm_i_string_set_x (res, p, SCM_CHAR (chr));
1206       scm_i_string_stop_writing ();
1207     }
1208
1209   return res;
1210 }
1211 #undef FUNC_NAME
1212
1213 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1214             (SCM string),
1215             "Return the number of characters in @var{string}.")
1216 #define FUNC_NAME s_scm_string_length
1217 {
1218   SCM_VALIDATE_STRING (1, string);
1219   return scm_from_size_t (STRING_LENGTH (string));
1220 }
1221 #undef FUNC_NAME
1222
1223 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1224             (SCM string),
1225             "Return the bytes used to represent a character in @var{string}."
1226             "This will return 1 or 4.")
1227 #define FUNC_NAME s_scm_string_bytes_per_char
1228 {
1229   SCM_VALIDATE_STRING (1, string);
1230   if (!scm_i_is_narrow_string (string))
1231     return scm_from_int (4);
1232
1233   return scm_from_int (1);
1234 }
1235 #undef FUNC_NAME
1236
1237 size_t
1238 scm_c_string_length (SCM string)
1239 {
1240   if (!IS_STRING (string))
1241     scm_wrong_type_arg_msg (NULL, 0, string, "string");
1242   return STRING_LENGTH (string);
1243 }
1244
1245 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1246             (SCM str, SCM k),
1247             "Return character @var{k} of @var{str} using zero-origin\n"
1248             "indexing. @var{k} must be a valid index of @var{str}.")
1249 #define FUNC_NAME s_scm_string_ref
1250 {
1251   size_t len;
1252   unsigned long idx;
1253
1254   SCM_VALIDATE_STRING (1, str);
1255
1256   len = scm_i_string_length (str);
1257   if (SCM_LIKELY (len > 0))
1258     idx = scm_to_unsigned_integer (k, 0, len - 1);
1259   else
1260     scm_out_of_range (NULL, k);
1261
1262   if (scm_i_is_narrow_string (str))
1263     return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1264   else
1265     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1266 }
1267 #undef FUNC_NAME
1268
1269 SCM
1270 scm_c_string_ref (SCM str, size_t p)
1271 {
1272   if (p >= scm_i_string_length (str))
1273     scm_out_of_range (NULL, scm_from_size_t (p));
1274   if (scm_i_is_narrow_string (str))
1275     return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1276   else
1277     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1278
1279 }
1280
1281 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1282             (SCM str, SCM k, SCM chr),
1283             "Store @var{chr} in element @var{k} of @var{str} and return\n"
1284             "an unspecified value. @var{k} must be a valid index of\n"
1285             "@var{str}.")
1286 #define FUNC_NAME s_scm_string_set_x
1287 {
1288   size_t len;
1289   unsigned long idx;
1290
1291   SCM_VALIDATE_STRING (1, str);
1292
1293   len = scm_i_string_length (str);
1294   if (SCM_LIKELY (len > 0))
1295     idx = scm_to_unsigned_integer (k, 0, len - 1);
1296   else
1297     scm_out_of_range (NULL, k);
1298
1299   SCM_VALIDATE_CHAR (3, chr);
1300   str = scm_i_string_start_writing (str);
1301   scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1302   scm_i_string_stop_writing ();
1303
1304   return SCM_UNSPECIFIED;
1305 }
1306 #undef FUNC_NAME
1307
1308 void
1309 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1310 {
1311   if (p >= scm_i_string_length (str))
1312     scm_out_of_range (NULL, scm_from_size_t (p));
1313   str = scm_i_string_start_writing (str);
1314   scm_i_string_set_x (str, p, SCM_CHAR (chr));
1315   scm_i_string_stop_writing ();
1316 }
1317
1318 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1319             (SCM str, SCM start, SCM end),
1320             "Return a newly allocated string formed from the characters\n"
1321             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1322             "ending with index @var{end} (exclusive).\n"
1323             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1324             "exact integers satisfying:\n\n"
1325             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1326 #define FUNC_NAME s_scm_substring
1327 {
1328   size_t len, from, to;
1329
1330   SCM_VALIDATE_STRING (1, str);
1331   len = scm_i_string_length (str);
1332   from = scm_to_unsigned_integer (start, 0, len);
1333   if (SCM_UNBNDP (end))
1334     to = len;
1335   else
1336     to = scm_to_unsigned_integer (end, from, len);
1337   return scm_i_substring (str, from, to);
1338 }
1339 #undef FUNC_NAME
1340
1341 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1342             (SCM str, SCM start, SCM end),
1343             "Return a newly allocated string formed from the characters\n"
1344             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1345             "ending with index @var{end} (exclusive).\n"
1346             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1347             "exact integers satisfying:\n"
1348             "\n"
1349             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1350             "\n"
1351             "The returned string is read-only.\n")
1352 #define FUNC_NAME s_scm_substring_read_only
1353 {
1354   size_t len, from, to;
1355
1356   SCM_VALIDATE_STRING (1, str);
1357   len = scm_i_string_length (str);
1358   from = scm_to_unsigned_integer (start, 0, len);
1359   if (SCM_UNBNDP (end))
1360     to = len;
1361   else
1362     to = scm_to_unsigned_integer (end, from, len);
1363   return scm_i_substring_read_only (str, from, to);
1364 }
1365 #undef FUNC_NAME
1366
1367 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1368             (SCM str, SCM start, SCM end),
1369             "Return a newly allocated string formed from the characters\n"
1370             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1371             "ending with index @var{end} (exclusive).\n"
1372             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1373             "exact integers satisfying:\n\n"
1374             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1375 #define FUNC_NAME s_scm_substring_copy
1376 {
1377   /* For the Scheme version, START is mandatory, but for the C
1378      version, it is optional.  See scm_string_copy in srfi-13.c for a
1379      rationale.
1380   */
1381
1382   size_t from, to;
1383
1384   SCM_VALIDATE_STRING (1, str);
1385   scm_i_get_substring_spec (scm_i_string_length (str),
1386                             start, &from, end, &to);
1387   return scm_i_substring_copy (str, from, to);
1388 }
1389 #undef FUNC_NAME
1390
1391 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1392             (SCM str, SCM start, SCM end),
1393             "Return string that indirectly refers to the characters\n"
1394             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1395             "ending with index @var{end} (exclusive).\n"
1396             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1397             "exact integers satisfying:\n\n"
1398             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1399 #define FUNC_NAME s_scm_substring_shared
1400 {
1401   size_t len, from, to;
1402
1403   SCM_VALIDATE_STRING (1, str);
1404   len = scm_i_string_length (str);
1405   from = scm_to_unsigned_integer (start, 0, len);
1406   if (SCM_UNBNDP (end))
1407     to = len;
1408   else
1409     to = scm_to_unsigned_integer (end, from, len);
1410   return scm_i_substring_shared (str, from, to);
1411 }
1412 #undef FUNC_NAME
1413
1414 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1415             (SCM args),
1416             "Return a newly allocated string whose characters form the\n"
1417             "concatenation of the given strings, @var{args}.")
1418 #define FUNC_NAME s_scm_string_append
1419 {
1420   SCM res;
1421   size_t len = 0;
1422   int wide = 0;
1423   SCM l, s;
1424   size_t i;
1425   union
1426   {
1427     char *narrow;
1428     scm_t_wchar *wide;
1429   } data;
1430
1431   SCM_VALIDATE_REST_ARGUMENT (args);
1432   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1433     {
1434       s = SCM_CAR (l);
1435       SCM_VALIDATE_STRING (SCM_ARGn, s);
1436       len += scm_i_string_length (s);
1437       if (!scm_i_is_narrow_string (s))
1438         wide = 1;
1439     }
1440   data.narrow = NULL;
1441   if (!wide)
1442     res = scm_i_make_string (len, &data.narrow, 0);
1443   else
1444     res = scm_i_make_wide_string (len, &data.wide, 0);
1445
1446   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1447     {
1448       size_t len;
1449       s = SCM_CAR (l);
1450       SCM_VALIDATE_STRING (SCM_ARGn, s);
1451       len = scm_i_string_length (s);
1452       if (!wide)
1453         {
1454           memcpy (data.narrow, scm_i_string_chars (s), len);
1455           data.narrow += len;
1456         }
1457       else
1458         {
1459           if (scm_i_is_narrow_string (s))
1460             {
1461               for (i = 0; i < scm_i_string_length (s); i++)
1462                 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
1463             }
1464           else
1465             u32_cpy ((scm_t_uint32 *) data.wide,
1466                      (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1467           data.wide += len;
1468         }
1469       scm_remember_upto_here_1 (s);
1470     }
1471   return res;
1472 }
1473 #undef FUNC_NAME
1474
1475
1476 \f
1477 /* Charset conversion error handling.  */
1478
1479 SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1480 SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1481
1482 /* Raise an exception informing that character CHR could not be written
1483    to PORT in its current encoding.  */
1484 void
1485 scm_encoding_error (const char *subr, int err, const char *message,
1486                     SCM port, SCM chr)
1487 {
1488   scm_throw (scm_encoding_error_key,
1489              scm_list_n (scm_from_latin1_string (subr),
1490                          scm_from_latin1_string (message),
1491                          scm_from_int (err),
1492                          port, chr,
1493                          SCM_UNDEFINED));
1494 }
1495
1496 /* Raise an exception informing of an encoding error on PORT.  This
1497    means that a character could not be written in PORT's encoding.  */
1498 void
1499 scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1500 {
1501   scm_throw (scm_decoding_error_key,
1502              scm_list_n (scm_from_latin1_string (subr),
1503                          scm_from_latin1_string (message),
1504                          scm_from_int (err),
1505                          port,
1506                          SCM_UNDEFINED));
1507 }
1508
1509 \f
1510 /* String conversion to/from C.  */
1511
1512 static void
1513 decoding_error (const char *func_name, int errno_save,
1514                 const char *str, size_t len)
1515 {
1516   /* Raise an error and pass the raw C string as a bytevector to the `throw'
1517      handler.  */
1518   SCM bv;
1519   signed char *buf;
1520
1521   buf = scm_gc_malloc_pointerless (len, "bytevector");
1522   memcpy (buf, str, len);
1523   bv = scm_c_take_gc_bytevector (buf, len, SCM_BOOL_F);
1524
1525   scm_decoding_error (func_name, errno_save,
1526                       "input locale conversion error", bv);
1527 }
1528
1529 SCM
1530 scm_from_stringn (const char *str, size_t len, const char *encoding,
1531                   scm_t_string_failed_conversion_handler handler)
1532 {
1533   size_t u32len, i;
1534   scm_t_wchar *u32;
1535   int wide = 0;
1536   SCM res;
1537
1538   /* The order of these checks is important. */
1539   if (!str && len != 0)
1540     scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1541   if (len == (size_t) -1)
1542     len = strlen (str);
1543
1544   if (encoding == NULL || len == 0)
1545     return scm_from_latin1_stringn (str, len);
1546   else if (strcmp (encoding, "UTF-8") == 0)
1547     return scm_from_utf8_stringn (str, len);
1548
1549   u32len = 0;
1550   scm_i_lock_iconv ();
1551   u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1552                                                 (enum iconv_ilseq_handler)
1553                                                 handler,
1554                                                 str, len,
1555                                                 NULL,
1556                                                 NULL, &u32len);
1557   scm_i_unlock_iconv ();
1558
1559   if (SCM_UNLIKELY (u32 == NULL))
1560     decoding_error (__func__, errno, str, len);
1561
1562   i = 0;
1563   while (i < u32len)
1564     if (u32[i++] > 0xFF)
1565       {
1566         wide = 1;
1567         break;
1568       }
1569
1570   if (!wide)
1571     {
1572       char *dst;
1573       res = scm_i_make_string (u32len, &dst, 0);
1574       for (i = 0; i < u32len; i ++)
1575         dst[i] = (unsigned char) u32[i];
1576       dst[u32len] = '\0';
1577     }
1578   else
1579     {
1580       scm_t_wchar *wdst;
1581       res = scm_i_make_wide_string (u32len, &wdst, 0);
1582       u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1583       wdst[u32len] = 0;
1584     }
1585
1586   free (u32);
1587   return res;
1588 }
1589
1590 SCM
1591 scm_from_locale_string (const char *str)
1592 {
1593   return scm_from_locale_stringn (str, -1);
1594 }
1595
1596 SCM
1597 scm_from_locale_stringn (const char *str, size_t len)
1598 {
1599   return scm_from_stringn (str, len, locale_charset (),
1600                            scm_i_get_conversion_strategy (SCM_BOOL_F));
1601 }
1602
1603 SCM
1604 scm_from_latin1_string (const char *str)
1605 {
1606   return scm_from_latin1_stringn (str, -1);
1607 }
1608
1609 SCM
1610 scm_from_latin1_stringn (const char *str, size_t len)
1611 {
1612   char *buf;
1613   SCM result;
1614
1615   if (len == (size_t) -1)
1616     len = strlen (str);
1617
1618   /* Make a narrow string and copy STR as is.  */
1619   result = scm_i_make_string (len, &buf, 0);
1620   memcpy (buf, str, len);
1621
1622   return result;
1623 }
1624
1625 SCM
1626 scm_from_utf8_string (const char *str)
1627 {
1628   return scm_from_utf8_stringn (str, -1);
1629 }
1630
1631 SCM
1632 scm_from_utf8_stringn (const char *str, size_t len)
1633 {
1634   size_t i, char_len;
1635   const scm_t_uint8 *ustr = (const scm_t_uint8 *) str;
1636   int ascii = 1, narrow = 1;
1637   SCM res;
1638
1639   if (len == (size_t) -1)
1640     len = strlen (str);
1641
1642   i = 0;
1643   char_len = 0;
1644
1645   while (i < len)
1646     {
1647       if (ustr[i] <= 127)
1648         {
1649           char_len++;
1650           i++;
1651         }
1652       else
1653         {
1654           ucs4_t c;
1655           int nbytes;
1656
1657           ascii = 0;
1658
1659           nbytes = u8_mbtouc (&c, ustr + i, len - i);
1660
1661           if (nbytes < 0)
1662             /* Bad UTF-8.  */
1663             decoding_error (__func__, errno, str, len);
1664
1665           if (c > 255)
1666             narrow = 0;
1667
1668           char_len++;
1669           i += nbytes;
1670         }
1671     }
1672
1673   if (ascii)
1674     {
1675       char *dst;
1676       res = scm_i_make_string (char_len, &dst, 0);
1677       memcpy (dst, str, len);
1678     }
1679   else if (narrow)
1680     {
1681       char *dst;
1682       size_t j;
1683       ucs4_t c;
1684
1685       res = scm_i_make_string (char_len, &dst, 0);
1686
1687       for (i = 0, j = 0; i < len; j++)
1688         {
1689           i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1690           dst[j] = (signed char) c;
1691         }
1692     }
1693   else
1694     {
1695       scm_t_wchar *dst;
1696       size_t j;
1697       ucs4_t c;
1698
1699       res = scm_i_make_wide_string (char_len, &dst, 0);
1700
1701       for (i = 0, j = 0; i < len; j++)
1702         {
1703           i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1704           dst[j] = c;
1705         }
1706     }
1707
1708   return res;
1709 }
1710
1711 SCM
1712 scm_from_utf32_string (const scm_t_wchar *str)
1713 {
1714   return scm_from_utf32_stringn (str, -1);
1715 }
1716
1717 SCM
1718 scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1719 {
1720   SCM result;
1721   scm_t_wchar *buf;
1722
1723   if (len == (size_t) -1)
1724     len = u32_strlen ((uint32_t *) str);
1725
1726   result = scm_i_make_wide_string (len, &buf, 0);
1727   memcpy (buf, str, len * sizeof (scm_t_wchar));
1728   scm_i_try_narrow_string (result);
1729
1730   return result;
1731 }
1732
1733 /* Create a new scheme string from the C string STR.  The memory of
1734    STR may be used directly as storage for the new string.  */
1735 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1736    would be to register a finalizer to eventually free(3) STR, which isn't
1737    worth it.  Should we just deprecate the `scm_take_' functions?  */
1738 SCM
1739 scm_take_locale_stringn (char *str, size_t len)
1740 {
1741   SCM res;
1742
1743   res = scm_from_locale_stringn (str, len);
1744   free (str);
1745
1746   return res;
1747 }
1748
1749 SCM
1750 scm_take_locale_string (char *str)
1751 {
1752   return scm_take_locale_stringn (str, -1);
1753 }
1754
1755 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1756    *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1757    Set *LENP to the size of the resulting string.
1758
1759    FIXME: This is a hack we should get rid of.  See
1760    <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1761    for details.  */
1762 static void
1763 unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
1764 {
1765   char *before, *after;
1766   size_t i, j;
1767
1768   before = buf;
1769   after = buf;
1770   i = 0;
1771   j = 0;
1772   while (i < *lenp)
1773     {
1774       if ((i <= *lenp - 6)
1775           && before[i] == '\\'
1776           && before[i + 1] == 'u'
1777           && before[i + 2] == '0' && before[i + 3] == '0')
1778         {
1779           /* Convert \u00NN to \xNN */
1780           after[j] = '\\';
1781           after[j + 1] = 'x';
1782           after[j + 2] = tolower ((int) before[i + 4]);
1783           after[j + 3] = tolower ((int) before[i + 5]);
1784           i += 6;
1785           j += 4;
1786         }
1787       else if ((i <= *lenp - 10)
1788                && before[i] == '\\'
1789                && before[i + 1] == 'U'
1790                && before[i + 2] == '0' && before[i + 3] == '0')
1791         {
1792           /* Convert \U00NNNNNN to \UNNNNNN */
1793           after[j] = '\\';
1794           after[j + 1] = 'U';
1795           after[j + 2] = tolower ((int) before[i + 4]);
1796           after[j + 3] = tolower ((int) before[i + 5]);
1797           after[j + 4] = tolower ((int) before[i + 6]);
1798           after[j + 5] = tolower ((int) before[i + 7]);
1799           after[j + 6] = tolower ((int) before[i + 8]);
1800           after[j + 7] = tolower ((int) before[i + 9]);
1801           i += 10;
1802           j += 8;
1803         }
1804       else
1805         {
1806           after[j] = before[i];
1807           i++;
1808           j++;
1809         }
1810     }
1811   *lenp = j;
1812 }
1813
1814 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1815    *LENP-byte locale-encoded string, to `\xXXXX;'.  Set *LEN to the size
1816    of the resulting string.  BUF must be large enough to handle the
1817    worst case when `\uXXXX' escapes (6 characters) are replaced by
1818    `\xXXXX;' (7 characters).  */
1819 static void
1820 unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
1821 {
1822   char *before, *after;
1823   size_t i, j;
1824   /* The worst case is if the input string contains all 4-digit hex escapes.
1825      "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1826   size_t max_out_len = (*lenp * 7) / 6 + 1;
1827   size_t nzeros, ndigits;
1828
1829   before = buf;
1830   after = alloca (max_out_len);
1831   i = 0;
1832   j = 0;
1833   while (i < *lenp)
1834     {
1835       if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1836           || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1837         {
1838           if (before[i + 1] == 'u')
1839             ndigits = 4;
1840           else if (before[i + 1] == 'U')
1841             ndigits = 8;
1842           else
1843             abort ();
1844
1845           /* Add the R6RS hex escape initial sequence.  */
1846           after[j] = '\\';
1847           after[j + 1] = 'x';
1848
1849           /* Move string positions to the start of the hex numbers.  */
1850           i += 2;
1851           j += 2;
1852
1853           /* Find the number of initial zeros in this hex number.  */
1854           nzeros = 0;
1855           while (before[i + nzeros] == '0' && nzeros < ndigits)
1856             nzeros++;
1857
1858           /* Copy the number, skipping initial zeros, and then move the string
1859              positions.  */
1860           if (nzeros == ndigits)
1861             {
1862               after[j] = '0';
1863               i += ndigits;
1864               j += 1;
1865             }
1866           else
1867             {
1868               int pos;
1869               for (pos = 0; pos < ndigits - nzeros; pos++)
1870                 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1871               i += ndigits;
1872               j += (ndigits - nzeros);
1873             }
1874
1875           /* Add terminating semicolon.  */
1876           after[j] = ';';
1877           j++;
1878         }
1879       else
1880         {
1881           after[j] = before[i];
1882           i++;
1883           j++;
1884         }
1885     }
1886   *lenp = j;
1887   memcpy (before, after, j);
1888 }
1889
1890 char *
1891 scm_to_locale_string (SCM str)
1892 {
1893   return scm_to_locale_stringn (str, NULL);
1894 }
1895
1896 char *
1897 scm_to_locale_stringn (SCM str, size_t *lenp)
1898 {
1899   return scm_to_stringn (str, lenp,
1900                          locale_charset (),
1901                          scm_i_get_conversion_strategy (SCM_BOOL_F));
1902 }
1903
1904 char *
1905 scm_to_latin1_string (SCM str)
1906 {
1907   return scm_to_latin1_stringn (str, NULL);
1908 }
1909
1910 char *
1911 scm_to_latin1_stringn (SCM str, size_t *lenp)
1912 #define FUNC_NAME "scm_to_latin1_stringn"
1913 {
1914   char *result;
1915
1916   SCM_VALIDATE_STRING (1, str);
1917
1918   if (scm_i_is_narrow_string (str))
1919     {
1920       size_t len = scm_i_string_length (str);
1921
1922       if (lenp)
1923         *lenp = len;
1924
1925       result = scm_strndup (scm_i_string_data (str), len);
1926     }
1927   else
1928     result = scm_to_stringn (str, lenp, NULL,
1929                              SCM_FAILED_CONVERSION_ERROR);
1930
1931   return result;
1932 }
1933 #undef FUNC_NAME
1934
1935 char *
1936 scm_to_utf8_string (SCM str)
1937 {
1938   return scm_to_utf8_stringn (str, NULL);
1939 }
1940
1941 static size_t
1942 latin1_u8_strlen (const scm_t_uint8 *str, size_t len)
1943 {
1944   size_t ret, i;
1945   for (i = 0, ret = 0; i < len; i++)
1946     ret += (str[i] < 128) ? 1 : 2;
1947   return ret;
1948 }
1949
1950 static scm_t_uint8*
1951 latin1_to_u8 (const scm_t_uint8 *str, size_t latin_len,
1952               scm_t_uint8 *u8_result, size_t *u8_lenp)
1953 {
1954   size_t i, n;
1955   size_t u8_len = latin1_u8_strlen (str, latin_len);
1956
1957   if (!(u8_result && u8_lenp && *u8_lenp > u8_len))
1958     u8_result = scm_malloc (u8_len + 1);
1959   if (u8_lenp)
1960     *u8_lenp = u8_len;
1961
1962   for (i = 0, n = 0; i < latin_len; i++)
1963     n += u8_uctomb (u8_result + n, str[i], u8_len - n);
1964   if (n != u8_len)
1965     abort ();
1966   u8_result[n] = 0;
1967
1968   return u8_result;
1969 }
1970
1971 char *
1972 scm_to_utf8_stringn (SCM str, size_t *lenp)
1973 {
1974   if (scm_i_is_narrow_string (str))
1975     return (char *) latin1_to_u8 ((scm_t_uint8 *) scm_i_string_chars (str),
1976                                   scm_i_string_length (str),
1977                                   NULL, lenp);
1978   else
1979     return (char *) u32_to_u8 ((scm_t_uint32*)scm_i_string_wide_chars (str),
1980                                scm_i_string_length (str),
1981                                NULL, lenp);
1982 }
1983
1984 scm_t_wchar *
1985 scm_to_utf32_string (SCM str)
1986 {
1987   return scm_to_utf32_stringn (str, NULL);
1988 }
1989
1990 scm_t_wchar *
1991 scm_to_utf32_stringn (SCM str, size_t *lenp)
1992 #define FUNC_NAME "scm_to_utf32_stringn"
1993 {
1994   scm_t_wchar *result;
1995
1996   SCM_VALIDATE_STRING (1, str);
1997
1998   if (scm_i_is_narrow_string (str))
1999     {
2000       scm_t_uint8 *codepoints;
2001       size_t i, len;
2002
2003       codepoints = (scm_t_uint8*) scm_i_string_chars (str);
2004       len = scm_i_string_length (str);
2005       if (lenp)
2006         *lenp = len;
2007
2008       result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2009       for (i = 0; i < len; i++)
2010         result[i] = codepoints[i];
2011       result[len] = 0;
2012     }
2013   else
2014     {
2015       size_t len;
2016
2017       len = scm_i_string_length (str);
2018       if (lenp)
2019         *lenp = len;
2020
2021       result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2022       memcpy (result, scm_i_string_wide_chars (str),
2023               len * sizeof (scm_t_wchar));
2024       result[len] = 0;
2025     }
2026
2027   return result;
2028 }
2029 #undef FUNC_NAME
2030
2031 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
2032    according to ENCODING.  If LENP is non-NULL, set it to the size in bytes of
2033    the returned buffer.  If the conversion to ENCODING fails, apply the strategy
2034    defined by HANDLER.  */
2035 char *
2036 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
2037                 scm_t_string_failed_conversion_handler handler)
2038 {
2039   char *buf;
2040   size_t ilen, len, i;
2041   int ret;
2042   const char *enc;
2043
2044   if (!scm_is_string (str))
2045     scm_wrong_type_arg_msg (NULL, 0, str, "string");
2046   ilen = scm_i_string_length (str);
2047
2048   if (ilen == 0)
2049     {
2050       buf = scm_malloc (1);
2051       buf[0] = '\0';
2052       if (lenp)
2053         *lenp = 0;
2054       return buf;
2055     }
2056
2057   if (lenp == NULL)
2058     for (i = 0; i < ilen; i++)
2059       if (scm_i_string_ref (str, i) == '\0')
2060         scm_misc_error (NULL,
2061                         "string contains #\\nul character: ~S",
2062                         scm_list_1 (str));
2063
2064   if (scm_i_is_narrow_string (str) && (encoding == NULL))
2065     {
2066       /* If using native Latin-1 encoding, just copy the string
2067          contents.  */
2068       if (lenp)
2069         {
2070           buf = scm_malloc (ilen);
2071           memcpy (buf, scm_i_string_chars (str), ilen);
2072           *lenp = ilen;
2073           return buf;
2074         }
2075       else
2076         {
2077           buf = scm_malloc (ilen + 1);
2078           memcpy (buf, scm_i_string_chars (str), ilen);
2079           buf[ilen] = '\0';
2080           return buf;
2081         }
2082     }
2083
2084
2085   buf = NULL;
2086   len = 0;
2087   enc = encoding;
2088   if (enc == NULL)
2089     enc = "ISO-8859-1";
2090   if (scm_i_is_narrow_string (str))
2091     {
2092       scm_i_lock_iconv ();
2093       ret = mem_iconveh (scm_i_string_chars (str), ilen,
2094                          "ISO-8859-1", enc,
2095                          (enum iconv_ilseq_handler) handler, NULL,
2096                          &buf, &len);
2097       scm_i_unlock_iconv ();
2098
2099       if (ret != 0)
2100         scm_encoding_error (__func__, errno,
2101                             "cannot convert narrow string to output locale",
2102                             SCM_BOOL_F,
2103                             /* FIXME: Faulty character unknown.  */
2104                             SCM_BOOL_F);
2105     }
2106   else
2107     {
2108       scm_i_lock_iconv ();
2109       buf = u32_conv_to_encoding (enc,
2110                                   (enum iconv_ilseq_handler) handler,
2111                                   (scm_t_uint32 *) scm_i_string_wide_chars (str),
2112                                   ilen,
2113                                   NULL,
2114                                   NULL, &len);
2115       scm_i_unlock_iconv ();
2116       if (buf == NULL)
2117         scm_encoding_error (__func__, errno,
2118                             "cannot convert wide string to output locale",
2119                             SCM_BOOL_F,
2120                             /* FIXME: Faulty character unknown.  */
2121                             SCM_BOOL_F);
2122     }
2123   if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
2124     {
2125       if (SCM_R6RS_ESCAPES_P)
2126         {
2127           /* The worst case is if the input string contains all 4-digit
2128              hex escapes.  "\uXXXX" (six characters) becomes "\xXXXX;"
2129              (seven characters).  Make BUF large enough to hold
2130              that.  */
2131           buf = scm_realloc (buf, (len * 7) / 6 + 1);
2132           unistring_escapes_to_r6rs_escapes (buf, &len);
2133         }
2134       else
2135         unistring_escapes_to_guile_escapes (buf, &len);
2136
2137       buf = scm_realloc (buf, len);
2138     }
2139   if (lenp)
2140     *lenp = len;
2141   else
2142     {
2143       buf = scm_realloc (buf, len + 1);
2144       buf[len] = '\0';
2145     }
2146
2147   scm_remember_upto_here_1 (str);
2148   return buf;
2149 }
2150
2151 size_t
2152 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
2153 {
2154   size_t len;
2155   char *result = NULL;
2156   if (!scm_is_string (str))
2157     scm_wrong_type_arg_msg (NULL, 0, str, "string");
2158   result = scm_to_locale_stringn (str, &len);
2159
2160   memcpy (buf, result, (len > max_len) ? max_len : len);
2161   free (result);
2162
2163   scm_remember_upto_here_1 (str);
2164   return len;
2165 }
2166
2167 \f
2168 /* Unicode string normalization.  */
2169
2170 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2171    libguile/i18n.c.  It would be useful to have this factored out into a more
2172    convenient location, but its use of alloca makes that tricky to do. */
2173
2174 static SCM
2175 normalize_str (SCM string, uninorm_t form)
2176 {
2177   SCM ret;
2178   scm_t_uint32 *w_str;
2179   scm_t_wchar *cbuf;
2180   size_t rlen, len = scm_i_string_length (string);
2181
2182   if (scm_i_is_narrow_string (string))
2183     {
2184       size_t i;
2185       const char *buf = scm_i_string_chars (string);
2186
2187       w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2188
2189       for (i = 0; i < len; i ++)
2190         w_str[i] = (unsigned char) buf[i];
2191       w_str[len] = 0;
2192     }
2193   else
2194     w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2195
2196   w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2197
2198   ret = scm_i_make_wide_string (rlen, &cbuf, 0);
2199   u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2200   free (w_str);
2201
2202   scm_i_try_narrow_string (ret);
2203
2204   return ret;
2205 }
2206
2207 SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2208             (SCM string),
2209             "Returns the NFC normalized form of @var{string}.")
2210 #define FUNC_NAME s_scm_string_normalize_nfc
2211 {
2212   SCM_VALIDATE_STRING (1, string);
2213   return normalize_str (string, UNINORM_NFC);
2214 }
2215 #undef FUNC_NAME
2216
2217 SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2218             (SCM string),
2219             "Returns the NFD normalized form of @var{string}.")
2220 #define FUNC_NAME s_scm_string_normalize_nfd
2221 {
2222   SCM_VALIDATE_STRING (1, string);
2223   return normalize_str (string, UNINORM_NFD);
2224 }
2225 #undef FUNC_NAME
2226
2227 SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2228             (SCM string),
2229             "Returns the NFKC normalized form of @var{string}.")
2230 #define FUNC_NAME s_scm_string_normalize_nfkc
2231 {
2232   SCM_VALIDATE_STRING (1, string);
2233   return normalize_str (string, UNINORM_NFKC);
2234 }
2235 #undef FUNC_NAME
2236
2237 SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2238             (SCM string),
2239             "Returns the NFKD normalized form of @var{string}.")
2240 #define FUNC_NAME s_scm_string_normalize_nfkd
2241 {
2242   SCM_VALIDATE_STRING (1, string);
2243   return normalize_str (string, UNINORM_NFKD);
2244 }
2245 #undef FUNC_NAME
2246
2247 /* converts C scm_array of strings to SCM scm_list of strings.
2248    If argc < 0, a null terminated scm_array is assumed.
2249    The current locale encoding is assumed */
2250 SCM
2251 scm_makfromstrs (int argc, char **argv)
2252 {
2253   int i = argc;
2254   SCM lst = SCM_EOL;
2255   if (0 > i)
2256     for (i = 0; argv[i]; i++);
2257   while (i--)
2258     lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2259   return lst;
2260 }
2261
2262 /* Return a newly allocated array of char pointers to each of the strings
2263    in args, with a terminating NULL pointer.  The strings are encoded using
2264    the current locale. */
2265
2266 char **
2267 scm_i_allocate_string_pointers (SCM list)
2268 #define FUNC_NAME "scm_i_allocate_string_pointers"
2269 {
2270   char **result;
2271   int list_len = scm_ilength (list);
2272   int i;
2273
2274   if (list_len < 0)
2275     scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2276
2277   result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2278                           "string pointers");
2279   result[list_len] = NULL;
2280
2281   /* The list might have been modified in another thread, so
2282      we check LIST before each access.
2283    */
2284   for (i = 0; i < list_len && scm_is_pair (list); i++)
2285     {
2286       SCM str = SCM_CAR (list);
2287       size_t len;  /* String length in bytes */
2288       char *c_str = scm_to_locale_stringn (str, &len);
2289
2290       /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2291          scm_malloc to allocate the returned string, which must be
2292          explicitly deallocated.  This forces us to copy the string a
2293          second time into a new buffer.  Ideally there would be variants
2294          of scm_to_*_stringn that can return garbage-collected buffers. */
2295
2296       result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2297       memcpy (result[i], c_str, len);
2298       result[i][len] = '\0';
2299       free (c_str);
2300
2301       list = SCM_CDR (list);
2302     }
2303
2304   return result;
2305 }
2306 #undef FUNC_NAME
2307
2308 void
2309 scm_i_get_substring_spec (size_t len,
2310                           SCM start, size_t *cstart,
2311                           SCM end, size_t *cend)
2312 {
2313   if (SCM_UNBNDP (start))
2314     *cstart = 0;
2315   else
2316     *cstart = scm_to_unsigned_integer (start, 0, len);
2317
2318   if (SCM_UNBNDP (end))
2319     *cend = len;
2320   else
2321     *cend = scm_to_unsigned_integer (end, *cstart, len);
2322 }
2323
2324 static SCM
2325 string_handle_ref (scm_t_array_handle *h, size_t index)
2326 {
2327   return scm_c_string_ref (h->array, index);
2328 }
2329
2330 static void
2331 string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2332 {
2333   scm_c_string_set_x (h->array, index, val);
2334 }
2335
2336 static void
2337 string_get_handle (SCM v, scm_t_array_handle *h)
2338 {
2339   h->array = v;
2340   h->ndims = 1;
2341   h->dims = &h->dim0;
2342   h->dim0.lbnd = 0;
2343   h->dim0.ubnd = scm_c_string_length (v) - 1;
2344   h->dim0.inc = 1;
2345   h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2346   h->elements = h->writable_elements = NULL;
2347 }
2348
2349 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2350                           string_handle_ref, string_handle_set,
2351                           string_get_handle)
2352 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2353
2354 void
2355 scm_init_strings ()
2356 {
2357   scm_nullstr = scm_i_make_string (0, NULL, 0);
2358
2359   scm_i_pthread_atfork (scm_i_lock_iconv, scm_i_unlock_iconv,
2360                         scm_i_unlock_iconv);
2361
2362 #include "libguile/strings.x"
2363 }
2364
2365
2366 /*
2367   Local Variables:
2368   c-file-style: "gnu"
2369   End:
2370 */