libguile/strings.c

   1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
   2  *
   3  * This library is free software; you can redistribute it and/or
   4  * modify it under the terms of the GNU Lesser General Public License
   5  * as published by the Free Software Foundation; either version 3 of
   6  * the License, or (at your option) any later version.
   7  *
   8  * This library is distributed in the hope that it will be useful, but
   9  * WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * Lesser General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU Lesser General Public
  14  * License along with this library; if not, write to the Free Software
  15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  16  * 02110-1301 USA
  17  */
  18
  19
  20 \f
  21 #ifdef HAVE_CONFIG_H
  22 # include <config.h>
  23 #endif
  24
  25 #include <alloca.h>
  26 #include <string.h>
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <uninorm.h>
  30 #include <unistr.h>
  31 #include <uniconv.h>
  32
  33 #include "striconveh.h"
  34
  35 #include "libguile/_scm.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/root.h"
  38 #include "libguile/strings.h"
  39 #include "libguile/error.h"
  40 #include "libguile/generalized-vectors.h"
  41 #include "libguile/deprecation.h"
  42 #include "libguile/validate.h"
  43 #include "libguile/private-options.h"
  44
  45 \f
  46
  47 /* {Strings}
  48  */
  49
  50
  51 /* Stringbufs
  52  *
  53  * XXX - keeping an accurate refcount during GC seems to be quite
  54  * tricky, so we just keep score of whether a stringbuf might be
  55  * shared, not whether it definitely is.
  56  *
  57  * The scheme I (mvo) tried to keep an accurate reference count would
  58  * recount all strings that point to a stringbuf during the mark-phase
  59  * of the GC.  This was done since one cannot access the stringbuf of
  60  * a string when that string is freed (in order to decrease the
  61  * reference count).  The memory of the stringbuf might have been
  62  * reused already for something completely different.
  63  *
  64  * This recounted worked for a small number of threads beating on
  65  * cow-strings, but it failed randomly with more than 10 threads, say.
  66  * I couldn't figure out what went wrong, so I used the conservative
  67  * approach implemented below.
  68  *
  69  * There are 2 storage strategies for stringbufs: 8-bit and wide.  8-bit
  70  * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
  71  * strings.
  72  */
  73
  74 /* The size in words of the stringbuf header (type tag + size).  */
  75 #define STRINGBUF_HEADER_SIZE   2U
  76
  77 #define STRINGBUF_HEADER_BYTES  (STRINGBUF_HEADER_SIZE * sizeof (SCM))
  78
  79 #define STRINGBUF_F_SHARED      SCM_I_STRINGBUF_F_SHARED
  80 #define STRINGBUF_F_WIDE        SCM_I_STRINGBUF_F_WIDE
  81
  82 #define STRINGBUF_TAG           scm_tc7_stringbuf
  83 #define STRINGBUF_SHARED(buf)   (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
  84 #define STRINGBUF_WIDE(buf)     (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
  85
  86 #define STRINGBUF_CONTENTS(buf) ((void *)                               \
  87                                  SCM_CELL_OBJECT_LOC (buf,              \
  88                                                       STRINGBUF_HEADER_SIZE))
  89 #define STRINGBUF_CHARS(buf)    ((unsigned char *) STRINGBUF_CONTENTS (buf))
  90 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
  91
  92 #define STRINGBUF_LENGTH(buf)   (SCM_CELL_WORD_1 (buf))
  93
  94 #define SET_STRINGBUF_SHARED(buf)                                       \
  95   do                                                                    \
  96     {                                                                   \
  97       /* Don't modify BUF if it's already marked as shared since it might be \
  98          a read-only, statically allocated stringbuf.  */               \
  99       if (SCM_LIKELY (!STRINGBUF_SHARED (buf)))                         \
 100         SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
 101     }                                                                   \
 102   while (0)
 103
 104 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 105 static size_t lenhist[1001];
 106 #endif
 107
 108 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
 109    characters. */
 110 static SCM
 111 make_stringbuf (size_t len)
 112 {
 113   /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
 114      scm_i_symbol_chars, all stringbufs are null-terminated.  Once
 115      SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
 116      has been changed for scm_i_symbol_chars, this null-termination
 117      can be dropped.
 118   */
 119
 120   SCM buf;
 121
 122 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 123   if (len < 1000)
 124     lenhist[len]++;
 125   else
 126     lenhist[1000]++;
 127 #endif
 128
 129   buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
 130                                             "string"));
 131
 132   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
 133   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 134
 135   STRINGBUF_CHARS (buf)[len] = 0;
 136
 137   return buf;
 138 }
 139
 140 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
 141    characters.  */
 142 static SCM
 143 make_wide_stringbuf (size_t len)
 144 {
 145   SCM buf;
 146   size_t raw_len;
 147
 148 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 149   if (len < 1000)
 150     lenhist[len]++;
 151   else
 152     lenhist[1000]++;
 153 #endif
 154
 155   raw_len = (len + 1) * sizeof (scm_t_wchar);
 156   buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
 157                                             "string"));
 158
 159   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
 160   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 161
 162   STRINGBUF_WIDE_CHARS (buf)[len] = 0;
 163
 164   return buf;
 165 }
 166
 167 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
 168    characters from BUF.  */
 169 static SCM
 170 wide_stringbuf (SCM buf)
 171 {
 172   SCM new_buf;
 173
 174   if (STRINGBUF_WIDE (buf))
 175     new_buf = buf;
 176   else
 177     {
 178       size_t i, len;
 179       scm_t_wchar *mem;
 180
 181       len = STRINGBUF_LENGTH (buf);
 182
 183       new_buf = make_wide_stringbuf (len);
 184
 185       mem = STRINGBUF_WIDE_CHARS (new_buf);
 186       for (i = 0; i < len; i++)
 187         mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
 188       mem[len] = 0;
 189     }
 190
 191   return new_buf;
 192 }
 193
 194 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
 195    characters from BUF, if possible.  */
 196 static SCM
 197 narrow_stringbuf (SCM buf)
 198 {
 199   SCM new_buf;
 200
 201   if (!STRINGBUF_WIDE (buf))
 202     new_buf = buf;
 203   else
 204     {
 205       size_t i, len;
 206       scm_t_wchar *wmem;
 207       unsigned char *mem;
 208
 209       len = STRINGBUF_LENGTH (buf);
 210       wmem = STRINGBUF_WIDE_CHARS (buf);
 211
 212       for (i = 0; i < len; i++)
 213         if (wmem[i] > 0xFF)
 214           /* BUF cannot be narrowed.  */
 215           return buf;
 216
 217       new_buf = make_stringbuf (len);
 218
 219       mem = STRINGBUF_CHARS (new_buf);
 220       for (i = 0; i < len; i++)
 221         mem[i] = (unsigned char) wmem[i];
 222       mem[len] = 0;
 223     }
 224
 225   return new_buf;
 226 }
 227
 228 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
 229
 230 \f
 231 /* Copy-on-write strings.
 232  */
 233
 234 #define STRING_TAG            scm_tc7_string
 235
 236 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
 237 #define STRING_START(str)     ((size_t)SCM_CELL_WORD_2(str))
 238 #define STRING_LENGTH(str)    ((size_t)SCM_CELL_WORD_3(str))
 239
 240 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
 241 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
 242
 243 #define IS_STRING(str)        (SCM_HAS_TYP7 (str, STRING_TAG))
 244
 245 /* Read-only strings.
 246  */
 247
 248 #define RO_STRING_TAG         scm_tc7_ro_string
 249 #define IS_RO_STRING(str)     (SCM_CELL_TYPE(str)==RO_STRING_TAG)
 250
 251 /* Mutation-sharing substrings
 252  */
 253
 254 #define SH_STRING_TAG       (scm_tc7_string + 0x100)
 255
 256 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
 257 /* START and LENGTH as for STRINGs. */
 258
 259 #define IS_SH_STRING(str)   (SCM_CELL_TYPE(str)==SH_STRING_TAG)
 260
 261 SCM scm_nullstr;
 262
 263 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
 264    characters.  CHARSP, if not NULL, will be set to location of the
 265    char array.  If READ_ONLY_P, the returned string is read-only;
 266    otherwise it is writable.  */
 267 SCM
 268 scm_i_make_string (size_t len, char **charsp, int read_only_p)
 269 {
 270   SCM buf = make_stringbuf (len);
 271   SCM res;
 272   if (charsp)
 273     *charsp = (char *) STRINGBUF_CHARS (buf);
 274   res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
 275                          SCM_UNPACK (buf),
 276                          (scm_t_bits) 0, (scm_t_bits) len);
 277   return res;
 278 }
 279
 280 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
 281    characters.  CHARSP, if not NULL, will be set to location of the
 282    character array.  If READ_ONLY_P, the returned string is read-only;
 283    otherwise it is writable.  */
 284 SCM
 285 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
 286 {
 287   SCM buf = make_wide_stringbuf (len);
 288   SCM res;
 289   if (charsp)
 290     *charsp = STRINGBUF_WIDE_CHARS (buf);
 291   res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
 292                          SCM_UNPACK (buf),
 293                          (scm_t_bits) 0, (scm_t_bits) len);
 294   return res;
 295 }
 296
 297 static void
 298 validate_substring_args (SCM str, size_t start, size_t end)
 299 {
 300   if (!IS_STRING (str))
 301     scm_wrong_type_arg_msg (NULL, 0, str, "string");
 302   if (start > STRING_LENGTH (str))
 303     scm_out_of_range (NULL, scm_from_size_t (start));
 304   if (end > STRING_LENGTH (str) || end < start)
 305     scm_out_of_range (NULL, scm_from_size_t (end));
 306 }
 307
 308 static inline void
 309 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
 310 {
 311   *start = STRING_START (*str);
 312   if (IS_SH_STRING (*str))
 313     {
 314       *str = SH_STRING_STRING (*str);
 315       *start += STRING_START (*str);
 316     }
 317   *buf = STRING_STRINGBUF (*str);
 318 }
 319
 320 SCM
 321 scm_i_substring (SCM str, size_t start, size_t end)
 322 {
 323   SCM buf;
 324   size_t str_start;
 325   get_str_buf_start (&str, &buf, &str_start);
 326   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 327   SET_STRINGBUF_SHARED (buf);
 328   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 329   return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
 330                           (scm_t_bits)str_start + start,
 331                           (scm_t_bits) end - start);
 332 }
 333
 334 SCM
 335 scm_i_substring_read_only (SCM str, size_t start, size_t end)
 336 {
 337   SCM buf;
 338   size_t str_start;
 339   get_str_buf_start (&str, &buf, &str_start);
 340   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 341   SET_STRINGBUF_SHARED (buf);
 342   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 343   return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
 344                           (scm_t_bits)str_start + start,
 345                           (scm_t_bits) end - start);
 346 }
 347
 348 SCM
 349 scm_i_substring_copy (SCM str, size_t start, size_t end)
 350 {
 351   size_t len = end - start;
 352   SCM buf, my_buf;
 353   size_t str_start;
 354   get_str_buf_start (&str, &buf, &str_start);
 355   if (scm_i_is_narrow_string (str))
 356     {
 357       my_buf = make_stringbuf (len);
 358       memcpy (STRINGBUF_CHARS (my_buf),
 359               STRINGBUF_CHARS (buf) + str_start + start, len);
 360     }
 361   else
 362     {
 363       my_buf = make_wide_stringbuf (len);
 364       u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
 365                (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
 366                                  + start), len);
 367       /* Even though this string is wide, the substring may be narrow.
 368          Consider adding code to narrow the string.  */
 369     }
 370   scm_remember_upto_here_1 (buf);
 371   return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
 372                           (scm_t_bits) 0, (scm_t_bits) len);
 373 }
 374
 375 SCM
 376 scm_i_substring_shared (SCM str, size_t start, size_t end)
 377 {
 378   if (start == 0 && end == STRING_LENGTH (str))
 379     return str;
 380   else
 381     {
 382       size_t len = end - start;
 383       if (IS_SH_STRING (str))
 384         {
 385           start += STRING_START (str);
 386           str = SH_STRING_STRING (str);
 387         }
 388       return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
 389                               (scm_t_bits)start, (scm_t_bits) len);
 390     }
 391 }
 392
 393 SCM
 394 scm_c_substring (SCM str, size_t start, size_t end)
 395 {
 396   validate_substring_args (str, start, end);
 397   return scm_i_substring (str, start, end);
 398 }
 399
 400 SCM
 401 scm_c_substring_read_only (SCM str, size_t start, size_t end)
 402 {
 403   validate_substring_args (str, start, end);
 404   return scm_i_substring_read_only (str, start, end);
 405 }
 406
 407 SCM
 408 scm_c_substring_copy (SCM str, size_t start, size_t end)
 409 {
 410   validate_substring_args (str, start, end);
 411   return scm_i_substring_copy (str, start, end);
 412 }
 413
 414 SCM
 415 scm_c_substring_shared (SCM str, size_t start, size_t end)
 416 {
 417   validate_substring_args (str, start, end);
 418   return scm_i_substring_shared (str, start, end);
 419 }
 420
 421 \f
 422 /* Internal accessors
 423  */
 424
 425 /* Returns the number of characters in STR.  This may be different
 426    than the memory size of the string storage.  */
 427 size_t
 428 scm_i_string_length (SCM str)
 429 {
 430   return STRING_LENGTH (str);
 431 }
 432
 433 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
 434    encoding.  False if it is 'wide', having a 32-bit UCS-4
 435    encoding.  */
 436 int
 437 scm_i_is_narrow_string (SCM str)
 438 {
 439   return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
 440 }
 441
 442 /* Try to coerce a string to be narrow.  It if is narrow already, do
 443    nothing.  If it is wide, shrink it to narrow if none of its
 444    characters are above 0xFF.  Return true if the string is narrow or
 445    was made to be narrow.  */
 446 int
 447 scm_i_try_narrow_string (SCM str)
 448 {
 449   SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
 450
 451   return scm_i_is_narrow_string (str);
 452 }
 453
 454 /* Return a pointer to the raw data of the string, which can be either Latin-1
 455    or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'.  */
 456 const void *
 457 scm_i_string_data (SCM str)
 458 {
 459   SCM buf;
 460   size_t start;
 461   const char *data;
 462
 463   get_str_buf_start (&str, &buf, &start);
 464
 465   data = STRINGBUF_CONTENTS (buf);
 466   data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
 467
 468   return data;
 469 }
 470
 471 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
 472    STR.  */
 473 const char *
 474 scm_i_string_chars (SCM str)
 475 {
 476   SCM buf;
 477   size_t start;
 478   get_str_buf_start (&str, &buf, &start);
 479   if (scm_i_is_narrow_string (str))
 480     return (const char *) STRINGBUF_CHARS (buf) + start;
 481   else
 482     scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
 483                     scm_list_1 (str));
 484   return NULL;
 485 }
 486
 487 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
 488    STR.  */
 489 const scm_t_wchar *
 490 scm_i_string_wide_chars (SCM str)
 491 {
 492   SCM buf;
 493   size_t start;
 494
 495   get_str_buf_start (&str, &buf, &start);
 496   if (!scm_i_is_narrow_string (str))
 497     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
 498   else
 499     scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
 500                     scm_list_1 (str));
 501 }
 502
 503 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
 504    a new string buffer, so that it can be modified without modifying
 505    other strings.  Also, lock the string mutex.  Later, one must call
 506    scm_i_string_stop_writing to unlock the mutex.  */
 507 SCM
 508 scm_i_string_start_writing (SCM orig_str)
 509 {
 510   SCM buf, str = orig_str;
 511   size_t start;
 512
 513   get_str_buf_start (&str, &buf, &start);
 514   if (IS_RO_STRING (str))
 515     scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
 516
 517   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 518   if (STRINGBUF_SHARED (buf))
 519     {
 520       /* Clone the stringbuf.  */
 521       size_t len = STRING_LENGTH (str);
 522       SCM new_buf;
 523
 524       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 525
 526       if (scm_i_is_narrow_string (str))
 527         {
 528           new_buf = make_stringbuf (len);
 529           memcpy (STRINGBUF_CHARS (new_buf),
 530                   STRINGBUF_CHARS (buf) + STRING_START (str), len);
 531
 532         }
 533       else
 534         {
 535           new_buf = make_wide_stringbuf (len);
 536           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 537                    (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
 538                                      + STRING_START (str)), len);
 539         }
 540
 541       SET_STRING_STRINGBUF (str, new_buf);
 542       start -= STRING_START (str);
 543
 544       /* FIXME: The following operations are not atomic, so other threads
 545          looking at STR may see an inconsistent state.  Nevertheless it can't
 546          hurt much since (i) accessing STR while it is being mutated can't
 547          yield a crash, and (ii) concurrent accesses to STR should be
 548          protected by a mutex at the application level.  The latter may not
 549          apply when STR != ORIG_STR, though.  */
 550       SET_STRING_START (str, 0);
 551       SET_STRING_STRINGBUF (str, new_buf);
 552
 553       buf = new_buf;
 554
 555       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 556     }
 557   return orig_str;
 558 }
 559
 560 /* Return a pointer to the 8-bit Latin-1 chars of a string.  */
 561 char *
 562 scm_i_string_writable_chars (SCM str)
 563 {
 564   SCM buf;
 565   size_t start;
 566
 567   get_str_buf_start (&str, &buf, &start);
 568   if (scm_i_is_narrow_string (str))
 569     return (char *) STRINGBUF_CHARS (buf) + start;
 570   else
 571     scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
 572                     scm_list_1 (str));
 573   return NULL;
 574 }
 575
 576 /* Return a pointer to the UCS-4 codepoints of a string.  */
 577 static scm_t_wchar *
 578 scm_i_string_writable_wide_chars (SCM str)
 579 {
 580   SCM buf;
 581   size_t start;
 582
 583   get_str_buf_start (&str, &buf, &start);
 584   if (!scm_i_is_narrow_string (str))
 585     return STRINGBUF_WIDE_CHARS (buf) + start;
 586   else
 587     scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
 588                     scm_list_1 (str));
 589 }
 590
 591 /* Unlock the string mutex that was locked when
 592    scm_i_string_start_writing was called.  */
 593 void
 594 scm_i_string_stop_writing (void)
 595 {
 596   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 597 }
 598
 599 /* Return the Xth character of STR as a UCS-4 codepoint.  */
 600 scm_t_wchar
 601 scm_i_string_ref (SCM str, size_t x)
 602 {
 603   if (scm_i_is_narrow_string (str))
 604     return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
 605   else
 606     return scm_i_string_wide_chars (str)[x];
 607 }
 608
 609 /* Returns index+1 of the first char in STR that matches C, or
 610    0 if the char is not found.  */
 611 int
 612 scm_i_string_contains_char (SCM str, char ch)
 613 {
 614   size_t i;
 615   size_t len = scm_i_string_length (str);
 616
 617   i = 0;
 618   if (scm_i_is_narrow_string (str))
 619     {
 620       while (i < len)
 621         {
 622           if (scm_i_string_chars (str)[i] == ch)
 623             return i+1;
 624           i++;
 625         }
 626     }
 627   else
 628     {
 629       while (i < len)
 630         {
 631           if (scm_i_string_wide_chars (str)[i]
 632               == (unsigned char) ch)
 633             return i+1;
 634           i++;
 635         }
 636     }
 637   return 0;
 638 }
 639
 640 int
 641 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
 642 {
 643   if (scm_i_is_narrow_string (sstr))
 644     {
 645       const char *a = scm_i_string_chars (sstr) + start_x;
 646       const char *b = cstr;
 647       return strncmp (a, b, strlen(b));
 648     }
 649   else
 650     {
 651       size_t i;
 652       const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
 653       const char *b = cstr;
 654       for (i = 0; i < strlen (b); i++)
 655         {
 656           if (a[i] != (unsigned char) b[i])
 657             return 1;
 658         }
 659     }
 660   return 0;
 661 }
 662
 663 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
 664 void
 665 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
 666 {
 667   if (chr > 0xFF && scm_i_is_narrow_string (str))
 668     SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
 669
 670   if (scm_i_is_narrow_string (str))
 671     {
 672       char *dst = scm_i_string_writable_chars (str);
 673       dst[p] = chr;
 674     }
 675   else
 676     {
 677       scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
 678       dst[p] = chr;
 679     }
 680 }
 681
 682 \f
 683 /* Symbols.
 684
 685    Basic symbol creation and accessing is done here, the rest is in
 686    symbols.[hc].  This has been done to keep stringbufs and the
 687    internals of strings and string-like objects confined to this file.
 688 */
 689
 690 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
 691
 692 SCM
 693 scm_i_make_symbol (SCM name, scm_t_bits flags,
 694                    unsigned long hash, SCM props)
 695 {
 696   SCM buf;
 697   size_t start = STRING_START (name);
 698   size_t length = STRING_LENGTH (name);
 699
 700   if (IS_SH_STRING (name))
 701     {
 702       name = SH_STRING_STRING (name);
 703       start += STRING_START (name);
 704     }
 705   buf = SYMBOL_STRINGBUF (name);
 706
 707   if (start == 0 && length == STRINGBUF_LENGTH (buf))
 708     {
 709       /* reuse buf. */
 710       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 711       SET_STRINGBUF_SHARED (buf);
 712       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 713     }
 714   else
 715     {
 716       /* make new buf. */
 717       if (scm_i_is_narrow_string (name))
 718         {
 719           SCM new_buf = make_stringbuf (length);
 720           memcpy (STRINGBUF_CHARS (new_buf),
 721                   STRINGBUF_CHARS (buf) + start, length);
 722           buf = new_buf;
 723         }
 724       else
 725         {
 726           SCM new_buf = make_wide_stringbuf (length);
 727           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 728                    (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
 729                    length);
 730           buf = new_buf;
 731         }
 732     }
 733   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 734                           (scm_t_bits) hash, SCM_UNPACK (props));
 735 }
 736
 737 SCM
 738 scm_i_c_make_symbol (const char *name, size_t len,
 739                      scm_t_bits flags, unsigned long hash, SCM props)
 740 {
 741   SCM buf = make_stringbuf (len);
 742   memcpy (STRINGBUF_CHARS (buf), name, len);
 743
 744   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 745                           (scm_t_bits) hash, SCM_UNPACK (props));
 746 }
 747
 748 /* Returns the number of characters in SYM.  This may be different
 749    from the memory size of SYM.  */
 750 size_t
 751 scm_i_symbol_length (SCM sym)
 752 {
 753   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 754 }
 755
 756 size_t
 757 scm_c_symbol_length (SCM sym)
 758 #define FUNC_NAME "scm_c_symbol_length"
 759 {
 760   SCM_VALIDATE_SYMBOL (1, sym);
 761
 762   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 763 }
 764 #undef FUNC_NAME
 765
 766 /* True if the name of SYM is stored as a Latin-1 encoded string.
 767    False if it is stored as a 32-bit UCS-4-encoded string.  */
 768 int
 769 scm_i_is_narrow_symbol (SCM sym)
 770 {
 771   SCM buf;
 772
 773   buf = SYMBOL_STRINGBUF (sym);
 774   return !STRINGBUF_WIDE (buf);
 775 }
 776
 777 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
 778    contains the name of SYM.  */
 779 const char *
 780 scm_i_symbol_chars (SCM sym)
 781 {
 782   SCM buf;
 783
 784   buf = SYMBOL_STRINGBUF (sym);
 785   if (!STRINGBUF_WIDE (buf))
 786     return (const char *) STRINGBUF_CHARS (buf);
 787   else
 788     scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
 789                     scm_list_1 (sym));
 790 }
 791
 792 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
 793    symbol's name.  */
 794 const scm_t_wchar *
 795 scm_i_symbol_wide_chars (SCM sym)
 796 {
 797   SCM buf;
 798
 799   buf = SYMBOL_STRINGBUF (sym);
 800   if (STRINGBUF_WIDE (buf))
 801     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
 802   else
 803     scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
 804                     scm_list_1 (sym));
 805 }
 806
 807 SCM
 808 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
 809 {
 810   SCM buf = SYMBOL_STRINGBUF (sym);
 811   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 812   SET_STRINGBUF_SHARED (buf);
 813   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 814   return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
 815                           (scm_t_bits)start, (scm_t_bits) end - start);
 816 }
 817
 818 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint.  */
 819 scm_t_wchar
 820 scm_i_symbol_ref (SCM sym, size_t x)
 821 {
 822   if (scm_i_is_narrow_symbol (sym))
 823     return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
 824   else
 825     return scm_i_symbol_wide_chars (sym)[x];
 826 }
 827
 828 /* Debugging
 829  */
 830
 831 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
 832             "Returns an association list containing debugging information\n"
 833             "for @var{str}. The association list has the following entries."
 834             "@table @code\n"
 835             "@item string\n"
 836             "The string itself.\n"
 837             "@item start\n"
 838             "The start index of the string into its stringbuf\n"
 839             "@item length\n"
 840             "The length of the string\n"
 841             "@item shared\n"
 842             "If this string is a substring, it returns its parent string.\n"
 843             "Otherwise, it returns @code{#f}\n"
 844             "@item read-only\n"
 845             "@code{#t} if the string is read-only\n"
 846             "@item stringbuf-chars\n"
 847             "A new string containing this string's stringbuf's characters\n"
 848             "@item stringbuf-length\n"
 849             "The number of characters in this stringbuf\n"
 850             "@item stringbuf-shared\n"
 851             "@code{#t} if this stringbuf is shared\n"
 852             "@item stringbuf-wide\n"
 853             "@code{#t} if this stringbuf's characters are stored in a\n"
 854             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
 855             "buffer\n"
 856             "@end table")
 857 #define FUNC_NAME s_scm_sys_string_dump
 858 {
 859   SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
 860   SCM buf;
 861   SCM_VALIDATE_STRING (1, str);
 862
 863   /* String info */
 864   e1 = scm_cons (scm_from_latin1_symbol ("string"),
 865                  str);
 866   e2 = scm_cons (scm_from_latin1_symbol ("start"),
 867                  scm_from_size_t (STRING_START (str)));
 868   e3 = scm_cons (scm_from_latin1_symbol ("length"),
 869                  scm_from_size_t (STRING_LENGTH (str)));
 870
 871   if (IS_SH_STRING (str))
 872     {
 873       e4 = scm_cons (scm_from_latin1_symbol ("shared"),
 874                      SH_STRING_STRING (str));
 875       buf = STRING_STRINGBUF (SH_STRING_STRING (str));
 876     }
 877   else
 878     {
 879       e4 = scm_cons (scm_from_latin1_symbol ("shared"),
 880                      SCM_BOOL_F);
 881       buf = STRING_STRINGBUF (str);
 882     }
 883
 884   if (IS_RO_STRING (str))
 885     e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
 886                    SCM_BOOL_T);
 887   else
 888     e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
 889                    SCM_BOOL_F);
 890
 891   /* Stringbuf info */
 892   if (!STRINGBUF_WIDE (buf))
 893     {
 894       size_t len = STRINGBUF_LENGTH (buf);
 895       char *cbuf;
 896       SCM sbc = scm_i_make_string (len, &cbuf, 0);
 897       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
 898       e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 899                      sbc);
 900     }
 901   else
 902     {
 903       size_t len = STRINGBUF_LENGTH (buf);
 904       scm_t_wchar *cbuf;
 905       SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
 906       u32_cpy ((scm_t_uint32 *) cbuf,
 907                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
 908       e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 909                      sbc);
 910     }
 911   e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
 912                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
 913   if (STRINGBUF_SHARED (buf))
 914     e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 915                    SCM_BOOL_T);
 916   else
 917     e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 918                    SCM_BOOL_F);
 919   if (STRINGBUF_WIDE (buf))
 920     e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 921                    SCM_BOOL_T);
 922   else
 923     e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 924                    SCM_BOOL_F);
 925
 926   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
 927 }
 928 #undef FUNC_NAME
 929
 930 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
 931             "Returns an association list containing debugging information\n"
 932             "for @var{sym}. The association list has the following entries."
 933             "@table @code\n"
 934             "@item symbol\n"
 935             "The symbol itself\n"
 936             "@item hash\n"
 937             "Its hash value\n"
 938             "@item interned\n"
 939             "@code{#t} if it is an interned symbol\n"
 940             "@item stringbuf-chars\n"
 941             "A new string containing this symbols's stringbuf's characters\n"
 942             "@item stringbuf-length\n"
 943             "The number of characters in this stringbuf\n"
 944             "@item stringbuf-shared\n"
 945             "@code{#t} if this stringbuf is shared\n"
 946             "@item stringbuf-wide\n"
 947             "@code{#t} if this stringbuf's characters are stored in a\n"
 948             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
 949             "buffer\n"
 950             "@end table")
 951 #define FUNC_NAME s_scm_sys_symbol_dump
 952 {
 953   SCM e1, e2, e3, e4, e5, e6, e7;
 954   SCM buf;
 955   SCM_VALIDATE_SYMBOL (1, sym);
 956   e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
 957                  sym);
 958   e2 = scm_cons (scm_from_latin1_symbol ("hash"),
 959                  scm_from_ulong (scm_i_symbol_hash (sym)));
 960   e3 = scm_cons (scm_from_latin1_symbol ("interned"),
 961                  scm_symbol_interned_p (sym));
 962   buf = SYMBOL_STRINGBUF (sym);
 963
 964   /* Stringbuf info */
 965   if (!STRINGBUF_WIDE (buf))
 966     {
 967       size_t len = STRINGBUF_LENGTH (buf);
 968       char *cbuf;
 969       SCM sbc = scm_i_make_string (len, &cbuf, 0);
 970       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
 971       e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 972                      sbc);
 973     }
 974   else
 975     {
 976       size_t len = STRINGBUF_LENGTH (buf);
 977       scm_t_wchar *cbuf;
 978       SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
 979       u32_cpy ((scm_t_uint32 *) cbuf,
 980                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
 981       e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 982                      sbc);
 983     }
 984   e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
 985                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
 986   if (STRINGBUF_SHARED (buf))
 987     e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 988                    SCM_BOOL_T);
 989   else
 990     e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 991                    SCM_BOOL_F);
 992   if (STRINGBUF_WIDE (buf))
 993     e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 994                     SCM_BOOL_T);
 995   else
 996     e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 997                     SCM_BOOL_F);
 998   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
 999
1000 }
1001 #undef FUNC_NAME
1002
1003 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1004
1005 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
1006 #define FUNC_NAME s_scm_sys_stringbuf_hist
1007 {
1008   int i;
1009   for (i = 0; i < 1000; i++)
1010     if (lenhist[i])
1011       fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1012   fprintf (stderr, ">999: %u\n", lenhist[1000]);
1013   return SCM_UNSPECIFIED;
1014 }
1015 #undef FUNC_NAME
1016
1017 #endif
1018
1019 \f
1020
1021 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1022             (SCM obj),
1023             "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1024 #define FUNC_NAME s_scm_string_p
1025 {
1026   return scm_from_bool (IS_STRING (obj));
1027 }
1028 #undef FUNC_NAME
1029
1030
1031 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1032
1033 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1034             (SCM chrs),
1035             "@deffnx {Scheme Procedure} list->string chrs\n"
1036             "Return a newly allocated string composed of the arguments,\n"
1037             "@var{chrs}.")
1038 #define FUNC_NAME s_scm_string
1039 {
1040   SCM result = SCM_BOOL_F;
1041   SCM rest;
1042   size_t len;
1043   size_t p = 0;
1044   long i;
1045   int wide = 0;
1046
1047   /* Verify that this is a list of chars.  */
1048   i = scm_ilength (chrs);
1049   SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1050
1051   len = (size_t) i;
1052   rest = chrs;
1053
1054   while (len > 0 && scm_is_pair (rest))
1055     {
1056       SCM elt = SCM_CAR (rest);
1057       SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1058       if (SCM_CHAR (elt) > 0xFF)
1059         wide = 1;
1060       rest = SCM_CDR (rest);
1061       len--;
1062       scm_remember_upto_here_1 (elt);
1063     }
1064
1065   /* Construct a string containing this list of chars.  */
1066   len = (size_t) i;
1067   rest = chrs;
1068
1069   if (wide == 0)
1070     {
1071       char *buf;
1072
1073       result = scm_i_make_string (len, NULL, 0);
1074       result = scm_i_string_start_writing (result);
1075       buf = scm_i_string_writable_chars (result);
1076       while (len > 0 && scm_is_pair (rest))
1077         {
1078           SCM elt = SCM_CAR (rest);
1079           buf[p] = (unsigned char) SCM_CHAR (elt);
1080           p++;
1081           rest = SCM_CDR (rest);
1082           len--;
1083           scm_remember_upto_here_1 (elt);
1084         }
1085     }
1086   else
1087     {
1088       scm_t_wchar *buf;
1089
1090       result = scm_i_make_wide_string (len, NULL, 0);
1091       result = scm_i_string_start_writing (result);
1092       buf = scm_i_string_writable_wide_chars (result);
1093       while (len > 0 && scm_is_pair (rest))
1094         {
1095           SCM elt = SCM_CAR (rest);
1096           buf[p] = SCM_CHAR (elt);
1097           p++;
1098           rest = SCM_CDR (rest);
1099           len--;
1100           scm_remember_upto_here_1 (elt);
1101         }
1102     }
1103   scm_i_string_stop_writing ();
1104
1105   if (len > 0)
1106     scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1107   if (!scm_is_null (rest))
1108     scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1109
1110   return result;
1111 }
1112 #undef FUNC_NAME
1113
1114 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1115             (SCM k, SCM chr),
1116             "Return a newly allocated string of\n"
1117             "length @var{k}.  If @var{chr} is given, then all elements of\n"
1118             "the string are initialized to @var{chr}, otherwise the contents\n"
1119             "of the @var{string} are all set to @var{#\nul}.")
1120 #define FUNC_NAME s_scm_make_string
1121 {
1122   return scm_c_make_string (scm_to_size_t (k), chr);
1123 }
1124 #undef FUNC_NAME
1125
1126 SCM
1127 scm_c_make_string (size_t len, SCM chr)
1128 #define FUNC_NAME NULL
1129 {
1130   size_t p;
1131   char *contents = NULL;
1132   SCM res = scm_i_make_string (len, &contents, 0);
1133
1134   /* If no char is given, initialize string contents to NULL.  */
1135   if (SCM_UNBNDP (chr))
1136     memset (contents, 0, len);
1137   else
1138     {
1139       SCM_VALIDATE_CHAR (0, chr);
1140       res = scm_i_string_start_writing (res);
1141       for (p = 0; p < len; p++)
1142         scm_i_string_set_x (res, p, SCM_CHAR (chr));
1143       scm_i_string_stop_writing ();
1144     }
1145
1146   return res;
1147 }
1148 #undef FUNC_NAME
1149
1150 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1151             (SCM string),
1152             "Return the number of characters in @var{string}.")
1153 #define FUNC_NAME s_scm_string_length
1154 {
1155   SCM_VALIDATE_STRING (1, string);
1156   return scm_from_size_t (STRING_LENGTH (string));
1157 }
1158 #undef FUNC_NAME
1159
1160 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1161             (SCM string),
1162             "Return the bytes used to represent a character in @var{string}."
1163             "This will return 1 or 4.")
1164 #define FUNC_NAME s_scm_string_bytes_per_char
1165 {
1166   SCM_VALIDATE_STRING (1, string);
1167   if (!scm_i_is_narrow_string (string))
1168     return scm_from_int (4);
1169
1170   return scm_from_int (1);
1171 }
1172 #undef FUNC_NAME
1173
1174 size_t
1175 scm_c_string_length (SCM string)
1176 {
1177   if (!IS_STRING (string))
1178     scm_wrong_type_arg_msg (NULL, 0, string, "string");
1179   return STRING_LENGTH (string);
1180 }
1181
1182 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1183             (SCM str, SCM k),
1184             "Return character @var{k} of @var{str} using zero-origin\n"
1185             "indexing. @var{k} must be a valid index of @var{str}.")
1186 #define FUNC_NAME s_scm_string_ref
1187 {
1188   size_t len;
1189   unsigned long idx;
1190
1191   SCM_VALIDATE_STRING (1, str);
1192
1193   len = scm_i_string_length (str);
1194   if (SCM_LIKELY (len > 0))
1195     idx = scm_to_unsigned_integer (k, 0, len - 1);
1196   else
1197     scm_out_of_range (NULL, k);
1198
1199   if (scm_i_is_narrow_string (str))
1200     return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1201   else
1202     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1203 }
1204 #undef FUNC_NAME
1205
1206 SCM
1207 scm_c_string_ref (SCM str, size_t p)
1208 {
1209   if (p >= scm_i_string_length (str))
1210     scm_out_of_range (NULL, scm_from_size_t (p));
1211   if (scm_i_is_narrow_string (str))
1212     return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1213   else
1214     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1215
1216 }
1217
1218 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1219             (SCM str, SCM k, SCM chr),
1220             "Store @var{chr} in element @var{k} of @var{str} and return\n"
1221             "an unspecified value. @var{k} must be a valid index of\n"
1222             "@var{str}.")
1223 #define FUNC_NAME s_scm_string_set_x
1224 {
1225   size_t len;
1226   unsigned long idx;
1227
1228   SCM_VALIDATE_STRING (1, str);
1229
1230   len = scm_i_string_length (str);
1231   if (SCM_LIKELY (len > 0))
1232     idx = scm_to_unsigned_integer (k, 0, len - 1);
1233   else
1234     scm_out_of_range (NULL, k);
1235
1236   SCM_VALIDATE_CHAR (3, chr);
1237   str = scm_i_string_start_writing (str);
1238   scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1239   scm_i_string_stop_writing ();
1240
1241   return SCM_UNSPECIFIED;
1242 }
1243 #undef FUNC_NAME
1244
1245 void
1246 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1247 {
1248   if (p >= scm_i_string_length (str))
1249     scm_out_of_range (NULL, scm_from_size_t (p));
1250   str = scm_i_string_start_writing (str);
1251   scm_i_string_set_x (str, p, SCM_CHAR (chr));
1252   scm_i_string_stop_writing ();
1253 }
1254
1255 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1256             (SCM str, SCM start, SCM end),
1257             "Return a newly allocated string formed from the characters\n"
1258             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1259             "ending with index @var{end} (exclusive).\n"
1260             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1261             "exact integers satisfying:\n\n"
1262             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1263 #define FUNC_NAME s_scm_substring
1264 {
1265   size_t len, from, to;
1266
1267   SCM_VALIDATE_STRING (1, str);
1268   len = scm_i_string_length (str);
1269   from = scm_to_unsigned_integer (start, 0, len);
1270   if (SCM_UNBNDP (end))
1271     to = len;
1272   else
1273     to = scm_to_unsigned_integer (end, from, len);
1274   return scm_i_substring (str, from, to);
1275 }
1276 #undef FUNC_NAME
1277
1278 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1279             (SCM str, SCM start, SCM end),
1280             "Return a newly allocated string formed from the characters\n"
1281             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1282             "ending with index @var{end} (exclusive).\n"
1283             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1284             "exact integers satisfying:\n"
1285             "\n"
1286             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1287             "\n"
1288             "The returned string is read-only.\n")
1289 #define FUNC_NAME s_scm_substring_read_only
1290 {
1291   size_t len, from, to;
1292
1293   SCM_VALIDATE_STRING (1, str);
1294   len = scm_i_string_length (str);
1295   from = scm_to_unsigned_integer (start, 0, len);
1296   if (SCM_UNBNDP (end))
1297     to = len;
1298   else
1299     to = scm_to_unsigned_integer (end, from, len);
1300   return scm_i_substring_read_only (str, from, to);
1301 }
1302 #undef FUNC_NAME
1303
1304 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1305             (SCM str, SCM start, SCM end),
1306             "Return a newly allocated string formed from the characters\n"
1307             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1308             "ending with index @var{end} (exclusive).\n"
1309             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1310             "exact integers satisfying:\n\n"
1311             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1312 #define FUNC_NAME s_scm_substring_copy
1313 {
1314   /* For the Scheme version, START is mandatory, but for the C
1315      version, it is optional.  See scm_string_copy in srfi-13.c for a
1316      rationale.
1317   */
1318
1319   size_t from, to;
1320
1321   SCM_VALIDATE_STRING (1, str);
1322   scm_i_get_substring_spec (scm_i_string_length (str),
1323                             start, &from, end, &to);
1324   return scm_i_substring_copy (str, from, to);
1325 }
1326 #undef FUNC_NAME
1327
1328 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1329             (SCM str, SCM start, SCM end),
1330             "Return string that indirectly refers to the characters\n"
1331             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1332             "ending with index @var{end} (exclusive).\n"
1333             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1334             "exact integers satisfying:\n\n"
1335             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1336 #define FUNC_NAME s_scm_substring_shared
1337 {
1338   size_t len, from, to;
1339
1340   SCM_VALIDATE_STRING (1, str);
1341   len = scm_i_string_length (str);
1342   from = scm_to_unsigned_integer (start, 0, len);
1343   if (SCM_UNBNDP (end))
1344     to = len;
1345   else
1346     to = scm_to_unsigned_integer (end, from, len);
1347   return scm_i_substring_shared (str, from, to);
1348 }
1349 #undef FUNC_NAME
1350
1351 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1352             (SCM args),
1353             "Return a newly allocated string whose characters form the\n"
1354             "concatenation of the given strings, @var{args}.")
1355 #define FUNC_NAME s_scm_string_append
1356 {
1357   SCM res;
1358   size_t len = 0;
1359   int wide = 0;
1360   SCM l, s;
1361   size_t i;
1362   union
1363   {
1364     char *narrow;
1365     scm_t_wchar *wide;
1366   } data;
1367
1368   SCM_VALIDATE_REST_ARGUMENT (args);
1369   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1370     {
1371       s = SCM_CAR (l);
1372       SCM_VALIDATE_STRING (SCM_ARGn, s);
1373       len += scm_i_string_length (s);
1374       if (!scm_i_is_narrow_string (s))
1375         wide = 1;
1376     }
1377   data.narrow = NULL;
1378   if (!wide)
1379     res = scm_i_make_string (len, &data.narrow, 0);
1380   else
1381     res = scm_i_make_wide_string (len, &data.wide, 0);
1382
1383   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1384     {
1385       size_t len;
1386       s = SCM_CAR (l);
1387       SCM_VALIDATE_STRING (SCM_ARGn, s);
1388       len = scm_i_string_length (s);
1389       if (!wide)
1390         {
1391           memcpy (data.narrow, scm_i_string_chars (s), len);
1392           data.narrow += len;
1393         }
1394       else
1395         {
1396           if (scm_i_is_narrow_string (s))
1397             {
1398               for (i = 0; i < scm_i_string_length (s); i++)
1399                 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
1400             }
1401           else
1402             u32_cpy ((scm_t_uint32 *) data.wide,
1403                      (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1404           data.wide += len;
1405         }
1406       scm_remember_upto_here_1 (s);
1407     }
1408   return res;
1409 }
1410 #undef FUNC_NAME
1411
1412
1413 \f
1414 /* Charset conversion error handling.  */
1415
1416 SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1417 SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1418
1419 /* Raise an exception informing that character CHR could not be written
1420    to PORT in its current encoding.  */
1421 void
1422 scm_encoding_error (const char *subr, int err, const char *message,
1423                     SCM port, SCM chr)
1424 {
1425   scm_throw (scm_encoding_error_key,
1426              scm_list_n (scm_from_latin1_string (subr),
1427                          scm_from_latin1_string (message),
1428                          scm_from_int (err),
1429                          port, chr,
1430                          SCM_UNDEFINED));
1431 }
1432
1433 /* Raise an exception informing of an encoding error on PORT.  This
1434    means that a character could not be written in PORT's encoding.  */
1435 void
1436 scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1437 {
1438   scm_throw (scm_decoding_error_key,
1439              scm_list_n (scm_from_latin1_string (subr),
1440                          scm_from_latin1_string (message),
1441                          scm_from_int (err),
1442                          port,
1443                          SCM_UNDEFINED));
1444 }
1445
1446 \f
1447 /* String conversion to/from C.  */
1448
1449 static void
1450 decoding_error (const char *func_name, int errno_save,
1451                 const char *str, size_t len)
1452 {
1453   /* Raise an error and pass the raw C string as a bytevector to the `throw'
1454      handler.  */
1455   SCM bv;
1456   signed char *buf;
1457
1458   buf = scm_gc_malloc_pointerless (len, "bytevector");
1459   memcpy (buf, str, len);
1460   bv = scm_c_take_gc_bytevector (buf, len, SCM_BOOL_F);
1461
1462   scm_decoding_error (func_name, errno_save,
1463                       "input locale conversion error", bv);
1464 }
1465
1466 SCM
1467 scm_from_stringn (const char *str, size_t len, const char *encoding,
1468                   scm_t_string_failed_conversion_handler handler)
1469 {
1470   size_t u32len, i;
1471   scm_t_wchar *u32;
1472   int wide = 0;
1473   SCM res;
1474
1475   /* The order of these checks is important. */
1476   if (!str && len != 0)
1477     scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1478   if (len == (size_t) -1)
1479     len = strlen (str);
1480   if (len == 0)
1481     return scm_nullstr;
1482
1483   if (encoding == NULL)
1484     {
1485       /* If encoding is null, use Latin-1.  */
1486       char *buf;
1487       res = scm_i_make_string (len, &buf, 0);
1488       memcpy (buf, str, len);
1489       return res;
1490     }
1491
1492   u32len = 0;
1493   u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1494                                                 (enum iconv_ilseq_handler)
1495                                                 handler,
1496                                                 str, len,
1497                                                 NULL,
1498                                                 NULL, &u32len);
1499
1500   if (SCM_UNLIKELY (u32 == NULL))
1501     decoding_error (__func__, errno, str, len);
1502
1503   i = 0;
1504   while (i < u32len)
1505     if (u32[i++] > 0xFF)
1506       {
1507         wide = 1;
1508         break;
1509       }
1510
1511   if (!wide)
1512     {
1513       char *dst;
1514       res = scm_i_make_string (u32len, &dst, 0);
1515       for (i = 0; i < u32len; i ++)
1516         dst[i] = (unsigned char) u32[i];
1517       dst[u32len] = '\0';
1518     }
1519   else
1520     {
1521       scm_t_wchar *wdst;
1522       res = scm_i_make_wide_string (u32len, &wdst, 0);
1523       u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1524       wdst[u32len] = 0;
1525     }
1526
1527   free (u32);
1528   return res;
1529 }
1530
1531 SCM
1532 scm_from_locale_string (const char *str)
1533 {
1534   return scm_from_locale_stringn (str, -1);
1535 }
1536
1537 SCM
1538 scm_from_locale_stringn (const char *str, size_t len)
1539 {
1540   return scm_from_stringn (str, len, locale_charset (),
1541                            scm_i_get_conversion_strategy (SCM_BOOL_F));
1542 }
1543
1544 SCM
1545 scm_from_latin1_string (const char *str)
1546 {
1547   return scm_from_latin1_stringn (str, -1);
1548 }
1549
1550 SCM
1551 scm_from_latin1_stringn (const char *str, size_t len)
1552 {
1553   char *buf;
1554   SCM result;
1555
1556   if (len == (size_t) -1)
1557     len = strlen (str);
1558
1559   /* Make a narrow string and copy STR as is.  */
1560   result = scm_i_make_string (len, &buf, 0);
1561   memcpy (buf, str, len);
1562
1563   return result;
1564 }
1565
1566 SCM
1567 scm_from_utf8_string (const char *str)
1568 {
1569   return scm_from_utf8_stringn (str, -1);
1570 }
1571
1572 SCM
1573 scm_from_utf8_stringn (const char *str, size_t len)
1574 {
1575   size_t i, char_len;
1576   const scm_t_uint8 *ustr = (const scm_t_uint8 *) str;
1577   int ascii = 1, narrow = 1;
1578   SCM res;
1579
1580   if (len == (size_t) -1)
1581     len = strlen (str);
1582
1583   i = 0;
1584   char_len = 0;
1585
1586   while (i < len)
1587     {
1588       if (ustr[i] <= 127)
1589         {
1590           char_len++;
1591           i++;
1592         }
1593       else
1594         {
1595           ucs4_t c;
1596           int nbytes;
1597
1598           ascii = 0;
1599
1600           nbytes = u8_mbtouc (&c, ustr + i, len - i);
1601
1602           if (nbytes < 0)
1603             /* Bad UTF-8.  */
1604             decoding_error (__func__, errno, str, len);
1605
1606           if (c > 255)
1607             narrow = 0;
1608
1609           char_len++;
1610           i += nbytes;
1611         }
1612     }
1613
1614   if (ascii)
1615     {
1616       char *dst;
1617       res = scm_i_make_string (char_len, &dst, 0);
1618       memcpy (dst, str, len);
1619     }
1620   else if (narrow)
1621     {
1622       char *dst;
1623       size_t j;
1624       ucs4_t c;
1625
1626       res = scm_i_make_string (char_len, &dst, 0);
1627
1628       for (i = 0, j = 0; i < len; i++, j++)
1629         {
1630           i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1631           dst[j] = (signed char) c;
1632         }
1633     }
1634   else
1635     {
1636       scm_t_wchar *dst;
1637       size_t j;
1638       ucs4_t c;
1639
1640       res = scm_i_make_wide_string (char_len, &dst, 0);
1641
1642       for (i = 0, j = 0; i < len; i++, j++)
1643         {
1644           i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1645           dst[j] = c;
1646         }
1647     }
1648
1649   return res;
1650 }
1651
1652 SCM
1653 scm_from_utf32_string (const scm_t_wchar *str)
1654 {
1655   return scm_from_utf32_stringn (str, -1);
1656 }
1657
1658 SCM
1659 scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1660 {
1661   SCM result;
1662   scm_t_wchar *buf;
1663
1664   if (len == (size_t) -1)
1665     len = u32_strlen ((uint32_t *) str);
1666
1667   result = scm_i_make_wide_string (len, &buf, 0);
1668   memcpy (buf, str, len * sizeof (scm_t_wchar));
1669   scm_i_try_narrow_string (result);
1670
1671   return result;
1672 }
1673
1674 /* Create a new scheme string from the C string STR.  The memory of
1675    STR may be used directly as storage for the new string.  */
1676 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1677    would be to register a finalizer to eventually free(3) STR, which isn't
1678    worth it.  Should we just deprecate the `scm_take_' functions?  */
1679 SCM
1680 scm_take_locale_stringn (char *str, size_t len)
1681 {
1682   SCM res;
1683
1684   res = scm_from_locale_stringn (str, len);
1685   free (str);
1686
1687   return res;
1688 }
1689
1690 SCM
1691 scm_take_locale_string (char *str)
1692 {
1693   return scm_take_locale_stringn (str, -1);
1694 }
1695
1696 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1697    *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1698    Set *LENP to the size of the resulting string.
1699
1700    FIXME: This is a hack we should get rid of.  See
1701    <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1702    for details.  */
1703 static void
1704 unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
1705 {
1706   char *before, *after;
1707   size_t i, j;
1708
1709   before = buf;
1710   after = buf;
1711   i = 0;
1712   j = 0;
1713   while (i < *lenp)
1714     {
1715       if ((i <= *lenp - 6)
1716           && before[i] == '\\'
1717           && before[i + 1] == 'u'
1718           && before[i + 2] == '0' && before[i + 3] == '0')
1719         {
1720           /* Convert \u00NN to \xNN */
1721           after[j] = '\\';
1722           after[j + 1] = 'x';
1723           after[j + 2] = tolower ((int) before[i + 4]);
1724           after[j + 3] = tolower ((int) before[i + 5]);
1725           i += 6;
1726           j += 4;
1727         }
1728       else if ((i <= *lenp - 10)
1729                && before[i] == '\\'
1730                && before[i + 1] == 'U'
1731                && before[i + 2] == '0' && before[i + 3] == '0')
1732         {
1733           /* Convert \U00NNNNNN to \UNNNNNN */
1734           after[j] = '\\';
1735           after[j + 1] = 'U';
1736           after[j + 2] = tolower ((int) before[i + 4]);
1737           after[j + 3] = tolower ((int) before[i + 5]);
1738           after[j + 4] = tolower ((int) before[i + 6]);
1739           after[j + 5] = tolower ((int) before[i + 7]);
1740           after[j + 6] = tolower ((int) before[i + 8]);
1741           after[j + 7] = tolower ((int) before[i + 9]);
1742           i += 10;
1743           j += 8;
1744         }
1745       else
1746         {
1747           after[j] = before[i];
1748           i++;
1749           j++;
1750         }
1751     }
1752   *lenp = j;
1753 }
1754
1755 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1756    *LENP-byte locale-encoded string, to `\xXXXX;'.  Set *LEN to the size
1757    of the resulting string.  BUF must be large enough to handle the
1758    worst case when `\uXXXX' escapes (6 characters) are replaced by
1759    `\xXXXX;' (7 characters).  */
1760 static void
1761 unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
1762 {
1763   char *before, *after;
1764   size_t i, j;
1765   /* The worst case is if the input string contains all 4-digit hex escapes.
1766      "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1767   size_t max_out_len = (*lenp * 7) / 6 + 1;
1768   size_t nzeros, ndigits;
1769
1770   before = buf;
1771   after = alloca (max_out_len);
1772   i = 0;
1773   j = 0;
1774   while (i < *lenp)
1775     {
1776       if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1777           || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1778         {
1779           if (before[i + 1] == 'u')
1780             ndigits = 4;
1781           else if (before[i + 1] == 'U')
1782             ndigits = 8;
1783           else
1784             abort ();
1785
1786           /* Add the R6RS hex escape initial sequence.  */
1787           after[j] = '\\';
1788           after[j + 1] = 'x';
1789
1790           /* Move string positions to the start of the hex numbers.  */
1791           i += 2;
1792           j += 2;
1793
1794           /* Find the number of initial zeros in this hex number.  */
1795           nzeros = 0;
1796           while (before[i + nzeros] == '0' && nzeros < ndigits)
1797             nzeros++;
1798
1799           /* Copy the number, skipping initial zeros, and then move the string
1800              positions.  */
1801           if (nzeros == ndigits)
1802             {
1803               after[j] = '0';
1804               i += ndigits;
1805               j += 1;
1806             }
1807           else
1808             {
1809               int pos;
1810               for (pos = 0; pos < ndigits - nzeros; pos++)
1811                 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1812               i += ndigits;
1813               j += (ndigits - nzeros);
1814             }
1815
1816           /* Add terminating semicolon.  */
1817           after[j] = ';';
1818           j++;
1819         }
1820       else
1821         {
1822           after[j] = before[i];
1823           i++;
1824           j++;
1825         }
1826     }
1827   *lenp = j;
1828   memcpy (before, after, j);
1829 }
1830
1831 char *
1832 scm_to_locale_string (SCM str)
1833 {
1834   return scm_to_locale_stringn (str, NULL);
1835 }
1836
1837 char *
1838 scm_to_locale_stringn (SCM str, size_t *lenp)
1839 {
1840   return scm_to_stringn (str, lenp,
1841                          locale_charset (),
1842                          scm_i_get_conversion_strategy (SCM_BOOL_F));
1843 }
1844
1845 char *
1846 scm_to_latin1_string (SCM str)
1847 {
1848   return scm_to_latin1_stringn (str, NULL);
1849 }
1850
1851 char *
1852 scm_to_latin1_stringn (SCM str, size_t *lenp)
1853 #define FUNC_NAME "scm_to_latin1_stringn"
1854 {
1855   char *result;
1856
1857   SCM_VALIDATE_STRING (1, str);
1858
1859   if (scm_i_is_narrow_string (str))
1860     {
1861       size_t len = scm_i_string_length (str);
1862
1863       if (lenp)
1864         *lenp = len;
1865
1866       result = scm_strndup (scm_i_string_data (str), len);
1867     }
1868   else
1869     result = scm_to_stringn (str, lenp, NULL,
1870                              SCM_FAILED_CONVERSION_ERROR);
1871
1872   return result;
1873 }
1874 #undef FUNC_NAME
1875
1876 char *
1877 scm_to_utf8_string (SCM str)
1878 {
1879   return scm_to_utf8_stringn (str, NULL);
1880 }
1881
1882 char *
1883 scm_to_utf8_stringn (SCM str, size_t *lenp)
1884 {
1885   return scm_to_stringn (str, lenp, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1886 }
1887
1888 scm_t_wchar *
1889 scm_to_utf32_string (SCM str)
1890 {
1891   return scm_to_utf32_stringn (str, NULL);
1892 }
1893
1894 scm_t_wchar *
1895 scm_to_utf32_stringn (SCM str, size_t *lenp)
1896 #define FUNC_NAME "scm_to_utf32_stringn"
1897 {
1898   scm_t_wchar *result;
1899
1900   SCM_VALIDATE_STRING (1, str);
1901
1902   if (scm_i_is_narrow_string (str))
1903     result = (scm_t_wchar *)
1904       scm_to_stringn (str, lenp, "UTF-32",
1905                       SCM_FAILED_CONVERSION_ERROR);
1906   else
1907     {
1908       size_t len;
1909
1910       len = scm_i_string_length (str);
1911       if (lenp)
1912         *lenp = len;
1913
1914       result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
1915       memcpy (result, scm_i_string_wide_chars (str),
1916               len * sizeof (scm_t_wchar));
1917       result[len] = 0;
1918     }
1919
1920   return result;
1921 }
1922 #undef FUNC_NAME
1923
1924 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
1925    according to ENCODING.  If LENP is non-NULL, set it to the size in bytes of
1926    the returned buffer.  If the conversion to ENCODING fails, apply the strategy
1927    defined by HANDLER.  */
1928 char *
1929 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
1930                 scm_t_string_failed_conversion_handler handler)
1931 {
1932   char *buf;
1933   size_t ilen, len, i;
1934   int ret;
1935   const char *enc;
1936
1937   if (!scm_is_string (str))
1938     scm_wrong_type_arg_msg (NULL, 0, str, "string");
1939   ilen = scm_i_string_length (str);
1940
1941   if (ilen == 0)
1942     {
1943       buf = scm_malloc (1);
1944       buf[0] = '\0';
1945       if (lenp)
1946         *lenp = 0;
1947       return buf;
1948     }
1949
1950   if (lenp == NULL)
1951     for (i = 0; i < ilen; i++)
1952       if (scm_i_string_ref (str, i) == '\0')
1953         scm_misc_error (NULL,
1954                         "string contains #\\nul character: ~S",
1955                         scm_list_1 (str));
1956
1957   if (scm_i_is_narrow_string (str) && (encoding == NULL))
1958     {
1959       /* If using native Latin-1 encoding, just copy the string
1960          contents.  */
1961       if (lenp)
1962         {
1963           buf = scm_malloc (ilen);
1964           memcpy (buf, scm_i_string_chars (str), ilen);
1965           *lenp = ilen;
1966           return buf;
1967         }
1968       else
1969         {
1970           buf = scm_malloc (ilen + 1);
1971           memcpy (buf, scm_i_string_chars (str), ilen);
1972           buf[ilen] = '\0';
1973           return buf;
1974         }
1975     }
1976
1977
1978   buf = NULL;
1979   len = 0;
1980   enc = encoding;
1981   if (enc == NULL)
1982     enc = "ISO-8859-1";
1983   if (scm_i_is_narrow_string (str))
1984     {
1985       ret = mem_iconveh (scm_i_string_chars (str), ilen,
1986                          "ISO-8859-1", enc,
1987                          (enum iconv_ilseq_handler) handler, NULL,
1988                          &buf, &len);
1989
1990       if (ret != 0)
1991         scm_encoding_error (__func__, errno,
1992                             "cannot convert narrow string to output locale",
1993                             SCM_BOOL_F,
1994                             /* FIXME: Faulty character unknown.  */
1995                             SCM_BOOL_F);
1996     }
1997   else
1998     {
1999       buf = u32_conv_to_encoding (enc,
2000                                   (enum iconv_ilseq_handler) handler,
2001                                   (scm_t_uint32 *) scm_i_string_wide_chars (str),
2002                                   ilen,
2003                                   NULL,
2004                                   NULL, &len);
2005       if (buf == NULL)
2006         scm_encoding_error (__func__, errno,
2007                             "cannot convert wide string to output locale",
2008                             SCM_BOOL_F,
2009                             /* FIXME: Faulty character unknown.  */
2010                             SCM_BOOL_F);
2011     }
2012   if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
2013     {
2014       if (SCM_R6RS_ESCAPES_P)
2015         {
2016           /* The worst case is if the input string contains all 4-digit
2017              hex escapes.  "\uXXXX" (six characters) becomes "\xXXXX;"
2018              (seven characters).  Make BUF large enough to hold
2019              that.  */
2020           buf = scm_realloc (buf, (len * 7) / 6 + 1);
2021           unistring_escapes_to_r6rs_escapes (buf, &len);
2022         }
2023       else
2024         unistring_escapes_to_guile_escapes (buf, &len);
2025
2026       buf = scm_realloc (buf, len);
2027     }
2028   if (lenp)
2029     *lenp = len;
2030   else
2031     {
2032       buf = scm_realloc (buf, len + 1);
2033       buf[len] = '\0';
2034     }
2035
2036   scm_remember_upto_here_1 (str);
2037   return buf;
2038 }
2039
2040 size_t
2041 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
2042 {
2043   size_t len;
2044   char *result = NULL;
2045   if (!scm_is_string (str))
2046     scm_wrong_type_arg_msg (NULL, 0, str, "string");
2047   result = scm_to_locale_stringn (str, &len);
2048
2049   memcpy (buf, result, (len > max_len) ? max_len : len);
2050   free (result);
2051
2052   scm_remember_upto_here_1 (str);
2053   return len;
2054 }
2055
2056 \f
2057 /* Unicode string normalization.  */
2058
2059 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2060    libguile/i18n.c.  It would be useful to have this factored out into a more
2061    convenient location, but its use of alloca makes that tricky to do. */
2062
2063 static SCM
2064 normalize_str (SCM string, uninorm_t form)
2065 {
2066   SCM ret;
2067   scm_t_uint32 *w_str;
2068   scm_t_wchar *cbuf;
2069   size_t rlen, len = scm_i_string_length (string);
2070
2071   if (scm_i_is_narrow_string (string))
2072     {
2073       size_t i;
2074       const char *buf = scm_i_string_chars (string);
2075
2076       w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2077
2078       for (i = 0; i < len; i ++)
2079         w_str[i] = (unsigned char) buf[i];
2080       w_str[len] = 0;
2081     }
2082   else
2083     w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2084
2085   w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2086
2087   ret = scm_i_make_wide_string (rlen, &cbuf, 0);
2088   u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2089   free (w_str);
2090
2091   scm_i_try_narrow_string (ret);
2092
2093   return ret;
2094 }
2095
2096 SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2097             (SCM string),
2098             "Returns the NFC normalized form of @var{string}.")
2099 #define FUNC_NAME s_scm_string_normalize_nfc
2100 {
2101   SCM_VALIDATE_STRING (1, string);
2102   return normalize_str (string, UNINORM_NFC);
2103 }
2104 #undef FUNC_NAME
2105
2106 SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2107             (SCM string),
2108             "Returns the NFD normalized form of @var{string}.")
2109 #define FUNC_NAME s_scm_string_normalize_nfd
2110 {
2111   SCM_VALIDATE_STRING (1, string);
2112   return normalize_str (string, UNINORM_NFD);
2113 }
2114 #undef FUNC_NAME
2115
2116 SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2117             (SCM string),
2118             "Returns the NFKC normalized form of @var{string}.")
2119 #define FUNC_NAME s_scm_string_normalize_nfkc
2120 {
2121   SCM_VALIDATE_STRING (1, string);
2122   return normalize_str (string, UNINORM_NFKC);
2123 }
2124 #undef FUNC_NAME
2125
2126 SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2127             (SCM string),
2128             "Returns the NFKD normalized form of @var{string}.")
2129 #define FUNC_NAME s_scm_string_normalize_nfkd
2130 {
2131   SCM_VALIDATE_STRING (1, string);
2132   return normalize_str (string, UNINORM_NFKD);
2133 }
2134 #undef FUNC_NAME
2135
2136 /* converts C scm_array of strings to SCM scm_list of strings.
2137    If argc < 0, a null terminated scm_array is assumed.
2138    The current locale encoding is assumed */
2139 SCM
2140 scm_makfromstrs (int argc, char **argv)
2141 {
2142   int i = argc;
2143   SCM lst = SCM_EOL;
2144   if (0 > i)
2145     for (i = 0; argv[i]; i++);
2146   while (i--)
2147     lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2148   return lst;
2149 }
2150
2151 /* Return a newly allocated array of char pointers to each of the strings
2152    in args, with a terminating NULL pointer.  The strings are encoded using
2153    the current locale. */
2154
2155 char **
2156 scm_i_allocate_string_pointers (SCM list)
2157 #define FUNC_NAME "scm_i_allocate_string_pointers"
2158 {
2159   char **result;
2160   int list_len = scm_ilength (list);
2161   int i;
2162
2163   if (list_len < 0)
2164     scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2165
2166   result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2167                           "string pointers");
2168   result[list_len] = NULL;
2169
2170   /* The list might have been modified in another thread, so
2171      we check LIST before each access.
2172    */
2173   for (i = 0; i < list_len && scm_is_pair (list); i++)
2174     {
2175       SCM str = SCM_CAR (list);
2176       size_t len;  /* String length in bytes */
2177       char *c_str = scm_to_locale_stringn (str, &len);
2178
2179       /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2180          scm_malloc to allocate the returned string, which must be
2181          explicitly deallocated.  This forces us to copy the string a
2182          second time into a new buffer.  Ideally there would be variants
2183          of scm_to_*_stringn that can return garbage-collected buffers. */
2184
2185       result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2186       memcpy (result[i], c_str, len);
2187       result[i][len] = '\0';
2188       free (c_str);
2189
2190       list = SCM_CDR (list);
2191     }
2192
2193   return result;
2194 }
2195 #undef FUNC_NAME
2196
2197 void
2198 scm_i_get_substring_spec (size_t len,
2199                           SCM start, size_t *cstart,
2200                           SCM end, size_t *cend)
2201 {
2202   if (SCM_UNBNDP (start))
2203     *cstart = 0;
2204   else
2205     *cstart = scm_to_unsigned_integer (start, 0, len);
2206
2207   if (SCM_UNBNDP (end))
2208     *cend = len;
2209   else
2210     *cend = scm_to_unsigned_integer (end, *cstart, len);
2211 }
2212
2213 static SCM
2214 string_handle_ref (scm_t_array_handle *h, size_t index)
2215 {
2216   return scm_c_string_ref (h->array, index);
2217 }
2218
2219 static void
2220 string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2221 {
2222   scm_c_string_set_x (h->array, index, val);
2223 }
2224
2225 static void
2226 string_get_handle (SCM v, scm_t_array_handle *h)
2227 {
2228   h->array = v;
2229   h->ndims = 1;
2230   h->dims = &h->dim0;
2231   h->dim0.lbnd = 0;
2232   h->dim0.ubnd = scm_c_string_length (v) - 1;
2233   h->dim0.inc = 1;
2234   h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2235   h->elements = h->writable_elements = NULL;
2236 }
2237
2238 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2239                           string_handle_ref, string_handle_set,
2240                           string_get_handle)
2241 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2242
2243 void
2244 scm_init_strings ()
2245 {
2246   scm_nullstr = scm_i_make_string (0, NULL, 1);
2247
2248 #include "libguile/strings.x"
2249 }
2250
2251
2252 /*
2253   Local Variables:
2254   c-file-style: "gnu"
2255   End:
2256 */