libguile/strings.c

   1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   2  *
   3  * This library is free software; you can redistribute it and/or
   4  * modify it under the terms of the GNU Lesser General Public License
   5  * as published by the Free Software Foundation; either version 3 of
   6  * the License, or (at your option) any later version.
   7  *
   8  * This library is distributed in the hope that it will be useful, but
   9  * WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * Lesser General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU Lesser General Public
  14  * License along with this library; if not, write to the Free Software
  15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  16  * 02110-1301 USA
  17  */
  18
  19
  20 \f
  21 #ifdef HAVE_CONFIG_H
  22 # include <config.h>
  23 #endif
  24
  25 #include <alloca.h>
  26 #include <string.h>
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <uninorm.h>
  30 #include <unistr.h>
  31 #include <uniconv.h>
  32
  33 #include "striconveh.h"
  34
  35 #include "libguile/_scm.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/root.h"
  38 #include "libguile/strings.h"
  39 #include "libguile/error.h"
  40 #include "libguile/generalized-vectors.h"
  41 #include "libguile/deprecation.h"
  42 #include "libguile/validate.h"
  43 #include "libguile/private-options.h"
  44
  45 \f
  46
  47 /* {Strings}
  48  */
  49
  50
  51 /* Stringbufs
  52  *
  53  * XXX - keeping an accurate refcount during GC seems to be quite
  54  * tricky, so we just keep score of whether a stringbuf might be
  55  * shared, not whether it definitely is.
  56  *
  57  * The scheme I (mvo) tried to keep an accurate reference count would
  58  * recount all strings that point to a stringbuf during the mark-phase
  59  * of the GC.  This was done since one cannot access the stringbuf of
  60  * a string when that string is freed (in order to decrease the
  61  * reference count).  The memory of the stringbuf might have been
  62  * reused already for something completely different.
  63  *
  64  * This recounted worked for a small number of threads beating on
  65  * cow-strings, but it failed randomly with more than 10 threads, say.
  66  * I couldn't figure out what went wrong, so I used the conservative
  67  * approach implemented below.
  68  *
  69  * There are 2 storage strategies for stringbufs: 8-bit and wide.  8-bit
  70  * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
  71  * strings.
  72  */
  73
  74 /* The size in words of the stringbuf header (type tag + size).  */
  75 #define STRINGBUF_HEADER_SIZE   2U
  76
  77 #define STRINGBUF_HEADER_BYTES  (STRINGBUF_HEADER_SIZE * sizeof (SCM))
  78
  79 #define STRINGBUF_F_SHARED      SCM_I_STRINGBUF_F_SHARED
  80 #define STRINGBUF_F_WIDE        SCM_I_STRINGBUF_F_WIDE
  81
  82 #define STRINGBUF_TAG           scm_tc7_stringbuf
  83 #define STRINGBUF_SHARED(buf)   (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
  84 #define STRINGBUF_WIDE(buf)     (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
  85
  86 #define STRINGBUF_CONTENTS(buf) ((void *)                               \
  87                                  SCM_CELL_OBJECT_LOC (buf,              \
  88                                                       STRINGBUF_HEADER_SIZE))
  89 #define STRINGBUF_CHARS(buf)    ((unsigned char *) STRINGBUF_CONTENTS (buf))
  90 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
  91
  92 #define STRINGBUF_LENGTH(buf)   (SCM_CELL_WORD_1 (buf))
  93
  94 #define SET_STRINGBUF_SHARED(buf)                                       \
  95   do                                                                    \
  96     {                                                                   \
  97       /* Don't modify BUF if it's already marked as shared since it might be \
  98          a read-only, statically allocated stringbuf.  */               \
  99       if (SCM_LIKELY (!STRINGBUF_SHARED (buf)))                         \
 100         SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
 101     }                                                                   \
 102   while (0)
 103
 104 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 105 static size_t lenhist[1001];
 106 #endif
 107
 108 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
 109    characters. */
 110 static SCM
 111 make_stringbuf (size_t len)
 112 {
 113   /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
 114      scm_i_symbol_chars, all stringbufs are null-terminated.  Once
 115      SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
 116      has been changed for scm_i_symbol_chars, this null-termination
 117      can be dropped.
 118   */
 119
 120   SCM buf;
 121
 122 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 123   if (len < 1000)
 124     lenhist[len]++;
 125   else
 126     lenhist[1000]++;
 127 #endif
 128
 129   buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
 130                                             "string"));
 131
 132   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
 133   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 134
 135   STRINGBUF_CHARS (buf)[len] = 0;
 136
 137   return buf;
 138 }
 139
 140 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
 141    characters.  */
 142 static SCM
 143 make_wide_stringbuf (size_t len)
 144 {
 145   SCM buf;
 146   size_t raw_len;
 147
 148 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 149   if (len < 1000)
 150     lenhist[len]++;
 151   else
 152     lenhist[1000]++;
 153 #endif
 154
 155   raw_len = (len + 1) * sizeof (scm_t_wchar);
 156   buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
 157                                             "string"));
 158
 159   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
 160   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 161
 162   STRINGBUF_WIDE_CHARS (buf)[len] = 0;
 163
 164   return buf;
 165 }
 166
 167 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
 168    characters from BUF.  */
 169 static SCM
 170 wide_stringbuf (SCM buf)
 171 {
 172   SCM new_buf;
 173
 174   if (STRINGBUF_WIDE (buf))
 175     new_buf = buf;
 176   else
 177     {
 178       size_t i, len;
 179       scm_t_wchar *mem;
 180
 181       len = STRINGBUF_LENGTH (buf);
 182
 183       new_buf = make_wide_stringbuf (len);
 184
 185       mem = STRINGBUF_WIDE_CHARS (new_buf);
 186       for (i = 0; i < len; i++)
 187         mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
 188       mem[len] = 0;
 189     }
 190
 191   return new_buf;
 192 }
 193
 194 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
 195    characters from BUF, if possible.  */
 196 static SCM
 197 narrow_stringbuf (SCM buf)
 198 {
 199   SCM new_buf;
 200
 201   if (!STRINGBUF_WIDE (buf))
 202     new_buf = buf;
 203   else
 204     {
 205       size_t i, len;
 206       scm_t_wchar *wmem;
 207       unsigned char *mem;
 208
 209       len = STRINGBUF_LENGTH (buf);
 210       wmem = STRINGBUF_WIDE_CHARS (buf);
 211
 212       for (i = 0; i < len; i++)
 213         if (wmem[i] > 0xFF)
 214           /* BUF cannot be narrowed.  */
 215           return buf;
 216
 217       new_buf = make_stringbuf (len);
 218
 219       mem = STRINGBUF_CHARS (new_buf);
 220       for (i = 0; i < len; i++)
 221         mem[i] = (unsigned char) wmem[i];
 222       mem[len] = 0;
 223     }
 224
 225   return new_buf;
 226 }
 227
 228 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
 229
 230 \f
 231 /* Copy-on-write strings.
 232  */
 233
 234 #define STRING_TAG            scm_tc7_string
 235
 236 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
 237 #define STRING_START(str)     ((size_t)SCM_CELL_WORD_2(str))
 238 #define STRING_LENGTH(str)    ((size_t)SCM_CELL_WORD_3(str))
 239
 240 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
 241 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
 242
 243 #define IS_STRING(str)        (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
 244
 245 /* Read-only strings.
 246  */
 247
 248 #define RO_STRING_TAG         scm_tc7_ro_string
 249 #define IS_RO_STRING(str)     (SCM_CELL_TYPE(str)==RO_STRING_TAG)
 250
 251 /* Mutation-sharing substrings
 252  */
 253
 254 #define SH_STRING_TAG       (scm_tc7_string + 0x100)
 255
 256 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
 257 /* START and LENGTH as for STRINGs. */
 258
 259 #define IS_SH_STRING(str)   (SCM_CELL_TYPE(str)==SH_STRING_TAG)
 260
 261 SCM scm_nullstr;
 262
 263 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
 264    characters.  CHARSP, if not NULL, will be set to location of the
 265    char array.  If READ_ONLY_P, the returned string is read-only;
 266    otherwise it is writable.  */
 267 SCM
 268 scm_i_make_string (size_t len, char **charsp, int read_only_p)
 269 {
 270   SCM buf = make_stringbuf (len);
 271   SCM res;
 272   if (charsp)
 273     *charsp = (char *) STRINGBUF_CHARS (buf);
 274   res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
 275                          SCM_UNPACK (buf),
 276                          (scm_t_bits) 0, (scm_t_bits) len);
 277   return res;
 278 }
 279
 280 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
 281    characters.  CHARSP, if not NULL, will be set to location of the
 282    character array.  If READ_ONLY_P, the returned string is read-only;
 283    otherwise it is writable.  */
 284 SCM
 285 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
 286 {
 287   SCM buf = make_wide_stringbuf (len);
 288   SCM res;
 289   if (charsp)
 290     *charsp = STRINGBUF_WIDE_CHARS (buf);
 291   res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
 292                          SCM_UNPACK (buf),
 293                          (scm_t_bits) 0, (scm_t_bits) len);
 294   return res;
 295 }
 296
 297 static void
 298 validate_substring_args (SCM str, size_t start, size_t end)
 299 {
 300   if (!IS_STRING (str))
 301     scm_wrong_type_arg_msg (NULL, 0, str, "string");
 302   if (start > STRING_LENGTH (str))
 303     scm_out_of_range (NULL, scm_from_size_t (start));
 304   if (end > STRING_LENGTH (str) || end < start)
 305     scm_out_of_range (NULL, scm_from_size_t (end));
 306 }
 307
 308 static inline void
 309 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
 310 {
 311   *start = STRING_START (*str);
 312   if (IS_SH_STRING (*str))
 313     {
 314       *str = SH_STRING_STRING (*str);
 315       *start += STRING_START (*str);
 316     }
 317   *buf = STRING_STRINGBUF (*str);
 318 }
 319
 320 SCM
 321 scm_i_substring (SCM str, size_t start, size_t end)
 322 {
 323   SCM buf;
 324   size_t str_start;
 325   get_str_buf_start (&str, &buf, &str_start);
 326   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 327   SET_STRINGBUF_SHARED (buf);
 328   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 329   return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
 330                           (scm_t_bits)str_start + start,
 331                           (scm_t_bits) end - start);
 332 }
 333
 334 SCM
 335 scm_i_substring_read_only (SCM str, size_t start, size_t end)
 336 {
 337   SCM buf;
 338   size_t str_start;
 339   get_str_buf_start (&str, &buf, &str_start);
 340   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 341   SET_STRINGBUF_SHARED (buf);
 342   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 343   return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
 344                           (scm_t_bits)str_start + start,
 345                           (scm_t_bits) end - start);
 346 }
 347
 348 SCM
 349 scm_i_substring_copy (SCM str, size_t start, size_t end)
 350 {
 351   size_t len = end - start;
 352   SCM buf, my_buf, substr;
 353   size_t str_start;
 354   int wide = 0;
 355   get_str_buf_start (&str, &buf, &str_start);
 356   if (scm_i_is_narrow_string (str))
 357     {
 358       my_buf = make_stringbuf (len);
 359       memcpy (STRINGBUF_CHARS (my_buf),
 360               STRINGBUF_CHARS (buf) + str_start + start, len);
 361     }
 362   else
 363     {
 364       my_buf = make_wide_stringbuf (len);
 365       u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
 366                (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
 367                                  + start), len);
 368       wide = 1;
 369     }
 370   scm_remember_upto_here_1 (buf);
 371   substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
 372                             (scm_t_bits) 0, (scm_t_bits) len);
 373   if (wide)
 374     scm_i_try_narrow_string (substr);
 375   return substr;
 376 }
 377
 378 SCM
 379 scm_i_substring_shared (SCM str, size_t start, size_t end)
 380 {
 381   if (start == 0 && end == STRING_LENGTH (str))
 382     return str;
 383   else
 384     {
 385       size_t len = end - start;
 386       if (IS_SH_STRING (str))
 387         {
 388           start += STRING_START (str);
 389           str = SH_STRING_STRING (str);
 390         }
 391       return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
 392                               (scm_t_bits)start, (scm_t_bits) len);
 393     }
 394 }
 395
 396 SCM
 397 scm_c_substring (SCM str, size_t start, size_t end)
 398 {
 399   validate_substring_args (str, start, end);
 400   return scm_i_substring (str, start, end);
 401 }
 402
 403 SCM
 404 scm_c_substring_read_only (SCM str, size_t start, size_t end)
 405 {
 406   validate_substring_args (str, start, end);
 407   return scm_i_substring_read_only (str, start, end);
 408 }
 409
 410 SCM
 411 scm_c_substring_copy (SCM str, size_t start, size_t end)
 412 {
 413   validate_substring_args (str, start, end);
 414   return scm_i_substring_copy (str, start, end);
 415 }
 416
 417 SCM
 418 scm_c_substring_shared (SCM str, size_t start, size_t end)
 419 {
 420   validate_substring_args (str, start, end);
 421   return scm_i_substring_shared (str, start, end);
 422 }
 423
 424 \f
 425 /* Internal accessors
 426  */
 427
 428 /* Returns the number of characters in STR.  This may be different
 429    than the memory size of the string storage.  */
 430 size_t
 431 scm_i_string_length (SCM str)
 432 {
 433   return STRING_LENGTH (str);
 434 }
 435
 436 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
 437    encoding.  False if it is 'wide', having a 32-bit UCS-4
 438    encoding.  */
 439 int
 440 scm_i_is_narrow_string (SCM str)
 441 {
 442   if (IS_SH_STRING (str))
 443     str = SH_STRING_STRING (str);
 444
 445   return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
 446 }
 447
 448 /* Try to coerce a string to be narrow.  It if is narrow already, do
 449    nothing.  If it is wide, shrink it to narrow if none of its
 450    characters are above 0xFF.  Return true if the string is narrow or
 451    was made to be narrow.  */
 452 int
 453 scm_i_try_narrow_string (SCM str)
 454 {
 455   if (IS_SH_STRING (str))
 456     str = SH_STRING_STRING (str);
 457
 458   SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
 459
 460   return scm_i_is_narrow_string (str);
 461 }
 462
 463 /* Return a pointer to the raw data of the string, which can be either Latin-1
 464    or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'.  */
 465 const void *
 466 scm_i_string_data (SCM str)
 467 {
 468   SCM buf;
 469   size_t start;
 470   const char *data;
 471
 472   get_str_buf_start (&str, &buf, &start);
 473
 474   data = STRINGBUF_CONTENTS (buf);
 475   data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
 476
 477   return data;
 478 }
 479
 480 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
 481    STR.  */
 482 const char *
 483 scm_i_string_chars (SCM str)
 484 {
 485   SCM buf;
 486   size_t start;
 487   get_str_buf_start (&str, &buf, &start);
 488   if (scm_i_is_narrow_string (str))
 489     return (const char *) STRINGBUF_CHARS (buf) + start;
 490   else
 491     scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
 492                     scm_list_1 (str));
 493   return NULL;
 494 }
 495
 496 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
 497    STR.  */
 498 const scm_t_wchar *
 499 scm_i_string_wide_chars (SCM str)
 500 {
 501   SCM buf;
 502   size_t start;
 503
 504   get_str_buf_start (&str, &buf, &start);
 505   if (!scm_i_is_narrow_string (str))
 506     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
 507   else
 508     scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
 509                     scm_list_1 (str));
 510 }
 511
 512 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
 513    a new string buffer, so that it can be modified without modifying
 514    other strings.  Also, lock the string mutex.  Later, one must call
 515    scm_i_string_stop_writing to unlock the mutex.  */
 516 SCM
 517 scm_i_string_start_writing (SCM orig_str)
 518 {
 519   SCM buf, str = orig_str;
 520   size_t start;
 521
 522   get_str_buf_start (&str, &buf, &start);
 523   if (IS_RO_STRING (str))
 524     scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
 525
 526   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 527   if (STRINGBUF_SHARED (buf))
 528     {
 529       /* Clone the stringbuf.  */
 530       size_t len = STRING_LENGTH (str);
 531       SCM new_buf;
 532
 533       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 534
 535       if (scm_i_is_narrow_string (str))
 536         {
 537           new_buf = make_stringbuf (len);
 538           memcpy (STRINGBUF_CHARS (new_buf),
 539                   STRINGBUF_CHARS (buf) + STRING_START (str), len);
 540
 541         }
 542       else
 543         {
 544           new_buf = make_wide_stringbuf (len);
 545           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 546                    (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
 547                                      + STRING_START (str)), len);
 548         }
 549
 550       SET_STRING_STRINGBUF (str, new_buf);
 551       start -= STRING_START (str);
 552
 553       /* FIXME: The following operations are not atomic, so other threads
 554          looking at STR may see an inconsistent state.  Nevertheless it can't
 555          hurt much since (i) accessing STR while it is being mutated can't
 556          yield a crash, and (ii) concurrent accesses to STR should be
 557          protected by a mutex at the application level.  The latter may not
 558          apply when STR != ORIG_STR, though.  */
 559       SET_STRING_START (str, 0);
 560       SET_STRING_STRINGBUF (str, new_buf);
 561
 562       buf = new_buf;
 563
 564       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 565     }
 566   return orig_str;
 567 }
 568
 569 /* Return a pointer to the 8-bit Latin-1 chars of a string.  */
 570 char *
 571 scm_i_string_writable_chars (SCM str)
 572 {
 573   SCM buf;
 574   size_t start;
 575
 576   get_str_buf_start (&str, &buf, &start);
 577   if (scm_i_is_narrow_string (str))
 578     return (char *) STRINGBUF_CHARS (buf) + start;
 579   else
 580     scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
 581                     scm_list_1 (str));
 582   return NULL;
 583 }
 584
 585 /* Return a pointer to the UCS-4 codepoints of a string.  */
 586 static scm_t_wchar *
 587 scm_i_string_writable_wide_chars (SCM str)
 588 {
 589   SCM buf;
 590   size_t start;
 591
 592   get_str_buf_start (&str, &buf, &start);
 593   if (!scm_i_is_narrow_string (str))
 594     return STRINGBUF_WIDE_CHARS (buf) + start;
 595   else
 596     scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
 597                     scm_list_1 (str));
 598 }
 599
 600 /* Unlock the string mutex that was locked when
 601    scm_i_string_start_writing was called.  */
 602 void
 603 scm_i_string_stop_writing (void)
 604 {
 605   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 606 }
 607
 608 /* Return the Xth character of STR as a UCS-4 codepoint.  */
 609 scm_t_wchar
 610 scm_i_string_ref (SCM str, size_t x)
 611 {
 612   if (scm_i_is_narrow_string (str))
 613     return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
 614   else
 615     return scm_i_string_wide_chars (str)[x];
 616 }
 617
 618 /* Returns index+1 of the first char in STR that matches C, or
 619    0 if the char is not found.  */
 620 int
 621 scm_i_string_contains_char (SCM str, char ch)
 622 {
 623   size_t i;
 624   size_t len = scm_i_string_length (str);
 625
 626   i = 0;
 627   if (scm_i_is_narrow_string (str))
 628     {
 629       while (i < len)
 630         {
 631           if (scm_i_string_chars (str)[i] == ch)
 632             return i+1;
 633           i++;
 634         }
 635     }
 636   else
 637     {
 638       while (i < len)
 639         {
 640           if (scm_i_string_wide_chars (str)[i]
 641               == (unsigned char) ch)
 642             return i+1;
 643           i++;
 644         }
 645     }
 646   return 0;
 647 }
 648
 649 int
 650 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
 651 {
 652   if (scm_i_is_narrow_string (sstr))
 653     {
 654       const char *a = scm_i_string_chars (sstr) + start_x;
 655       const char *b = cstr;
 656       return strncmp (a, b, strlen(b));
 657     }
 658   else
 659     {
 660       size_t i;
 661       const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
 662       const char *b = cstr;
 663       for (i = 0; i < strlen (b); i++)
 664         {
 665           if (a[i] != (unsigned char) b[i])
 666             return 1;
 667         }
 668     }
 669   return 0;
 670 }
 671
 672 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
 673 void
 674 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
 675 {
 676   if (IS_SH_STRING (str))
 677     {
 678       p += STRING_START (str);
 679       str = SH_STRING_STRING (str);
 680     }
 681
 682   if (chr > 0xFF && scm_i_is_narrow_string (str))
 683     SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
 684
 685   if (scm_i_is_narrow_string (str))
 686     {
 687       char *dst = scm_i_string_writable_chars (str);
 688       dst[p] = chr;
 689     }
 690   else
 691     {
 692       scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
 693       dst[p] = chr;
 694     }
 695 }
 696
 697 \f
 698 /* Symbols.
 699
 700    Basic symbol creation and accessing is done here, the rest is in
 701    symbols.[hc].  This has been done to keep stringbufs and the
 702    internals of strings and string-like objects confined to this file.
 703 */
 704
 705 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
 706
 707 SCM
 708 scm_i_make_symbol (SCM name, scm_t_bits flags,
 709                    unsigned long hash, SCM props)
 710 {
 711   SCM buf;
 712   size_t start = STRING_START (name);
 713   size_t length = STRING_LENGTH (name);
 714
 715   if (IS_SH_STRING (name))
 716     {
 717       name = SH_STRING_STRING (name);
 718       start += STRING_START (name);
 719     }
 720   buf = SYMBOL_STRINGBUF (name);
 721
 722   if (start == 0 && length == STRINGBUF_LENGTH (buf))
 723     {
 724       /* reuse buf. */
 725       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 726       SET_STRINGBUF_SHARED (buf);
 727       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 728     }
 729   else
 730     {
 731       /* make new buf. */
 732       if (scm_i_is_narrow_string (name))
 733         {
 734           SCM new_buf = make_stringbuf (length);
 735           memcpy (STRINGBUF_CHARS (new_buf),
 736                   STRINGBUF_CHARS (buf) + start, length);
 737           buf = new_buf;
 738         }
 739       else
 740         {
 741           SCM new_buf = make_wide_stringbuf (length);
 742           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 743                    (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
 744                    length);
 745           buf = new_buf;
 746         }
 747     }
 748   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 749                           (scm_t_bits) hash, SCM_UNPACK (props));
 750 }
 751
 752 SCM
 753 scm_i_c_make_symbol (const char *name, size_t len,
 754                      scm_t_bits flags, unsigned long hash, SCM props)
 755 {
 756   SCM buf = make_stringbuf (len);
 757   memcpy (STRINGBUF_CHARS (buf), name, len);
 758
 759   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 760                           (scm_t_bits) hash, SCM_UNPACK (props));
 761 }
 762
 763 /* Returns the number of characters in SYM.  This may be different
 764    from the memory size of SYM.  */
 765 size_t
 766 scm_i_symbol_length (SCM sym)
 767 {
 768   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 769 }
 770
 771 size_t
 772 scm_c_symbol_length (SCM sym)
 773 #define FUNC_NAME "scm_c_symbol_length"
 774 {
 775   SCM_VALIDATE_SYMBOL (1, sym);
 776
 777   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 778 }
 779 #undef FUNC_NAME
 780
 781 /* True if the name of SYM is stored as a Latin-1 encoded string.
 782    False if it is stored as a 32-bit UCS-4-encoded string.  */
 783 int
 784 scm_i_is_narrow_symbol (SCM sym)
 785 {
 786   SCM buf;
 787
 788   buf = SYMBOL_STRINGBUF (sym);
 789   return !STRINGBUF_WIDE (buf);
 790 }
 791
 792 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
 793    contains the name of SYM.  */
 794 const char *
 795 scm_i_symbol_chars (SCM sym)
 796 {
 797   SCM buf;
 798
 799   buf = SYMBOL_STRINGBUF (sym);
 800   if (!STRINGBUF_WIDE (buf))
 801     return (const char *) STRINGBUF_CHARS (buf);
 802   else
 803     scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
 804                     scm_list_1 (sym));
 805 }
 806
 807 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
 808    symbol's name.  */
 809 const scm_t_wchar *
 810 scm_i_symbol_wide_chars (SCM sym)
 811 {
 812   SCM buf;
 813
 814   buf = SYMBOL_STRINGBUF (sym);
 815   if (STRINGBUF_WIDE (buf))
 816     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
 817   else
 818     scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
 819                     scm_list_1 (sym));
 820 }
 821
 822 SCM
 823 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
 824 {
 825   SCM buf = SYMBOL_STRINGBUF (sym);
 826   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 827   SET_STRINGBUF_SHARED (buf);
 828   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 829   return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
 830                           (scm_t_bits)start, (scm_t_bits) end - start);
 831 }
 832
 833 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint.  */
 834 scm_t_wchar
 835 scm_i_symbol_ref (SCM sym, size_t x)
 836 {
 837   if (scm_i_is_narrow_symbol (sym))
 838     return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
 839   else
 840     return scm_i_symbol_wide_chars (sym)[x];
 841 }
 842
 843 /* Debugging
 844  */
 845
 846 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
 847             "Returns an association list containing debugging information\n"
 848             "for @var{str}. The association list has the following entries."
 849             "@table @code\n"
 850             "@item string\n"
 851             "The string itself.\n"
 852             "@item start\n"
 853             "The start index of the string into its stringbuf\n"
 854             "@item length\n"
 855             "The length of the string\n"
 856             "@item shared\n"
 857             "If this string is a substring, it returns its parent string.\n"
 858             "Otherwise, it returns @code{#f}\n"
 859             "@item read-only\n"
 860             "@code{#t} if the string is read-only\n"
 861             "@item stringbuf-chars\n"
 862             "A new string containing this string's stringbuf's characters\n"
 863             "@item stringbuf-length\n"
 864             "The number of characters in this stringbuf\n"
 865             "@item stringbuf-shared\n"
 866             "@code{#t} if this stringbuf is shared\n"
 867             "@item stringbuf-wide\n"
 868             "@code{#t} if this stringbuf's characters are stored in a\n"
 869             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
 870             "buffer\n"
 871             "@end table")
 872 #define FUNC_NAME s_scm_sys_string_dump
 873 {
 874   SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
 875   SCM buf;
 876   SCM_VALIDATE_STRING (1, str);
 877
 878   /* String info */
 879   e1 = scm_cons (scm_from_latin1_symbol ("string"),
 880                  str);
 881   e2 = scm_cons (scm_from_latin1_symbol ("start"),
 882                  scm_from_size_t (STRING_START (str)));
 883   e3 = scm_cons (scm_from_latin1_symbol ("length"),
 884                  scm_from_size_t (STRING_LENGTH (str)));
 885
 886   if (IS_SH_STRING (str))
 887     {
 888       e4 = scm_cons (scm_from_latin1_symbol ("shared"),
 889                      SH_STRING_STRING (str));
 890       buf = STRING_STRINGBUF (SH_STRING_STRING (str));
 891     }
 892   else
 893     {
 894       e4 = scm_cons (scm_from_latin1_symbol ("shared"),
 895                      SCM_BOOL_F);
 896       buf = STRING_STRINGBUF (str);
 897     }
 898
 899   if (IS_RO_STRING (str))
 900     e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
 901                    SCM_BOOL_T);
 902   else
 903     e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
 904                    SCM_BOOL_F);
 905
 906   /* Stringbuf info */
 907   if (!STRINGBUF_WIDE (buf))
 908     {
 909       size_t len = STRINGBUF_LENGTH (buf);
 910       char *cbuf;
 911       SCM sbc = scm_i_make_string (len, &cbuf, 0);
 912       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
 913       e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 914                      sbc);
 915     }
 916   else
 917     {
 918       size_t len = STRINGBUF_LENGTH (buf);
 919       scm_t_wchar *cbuf;
 920       SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
 921       u32_cpy ((scm_t_uint32 *) cbuf,
 922                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
 923       e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 924                      sbc);
 925     }
 926   e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
 927                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
 928   if (STRINGBUF_SHARED (buf))
 929     e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 930                    SCM_BOOL_T);
 931   else
 932     e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 933                    SCM_BOOL_F);
 934   if (STRINGBUF_WIDE (buf))
 935     e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 936                    SCM_BOOL_T);
 937   else
 938     e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 939                    SCM_BOOL_F);
 940
 941   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
 942 }
 943 #undef FUNC_NAME
 944
 945 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
 946             "Returns an association list containing debugging information\n"
 947             "for @var{sym}. The association list has the following entries."
 948             "@table @code\n"
 949             "@item symbol\n"
 950             "The symbol itself\n"
 951             "@item hash\n"
 952             "Its hash value\n"
 953             "@item interned\n"
 954             "@code{#t} if it is an interned symbol\n"
 955             "@item stringbuf-chars\n"
 956             "A new string containing this symbols's stringbuf's characters\n"
 957             "@item stringbuf-length\n"
 958             "The number of characters in this stringbuf\n"
 959             "@item stringbuf-shared\n"
 960             "@code{#t} if this stringbuf is shared\n"
 961             "@item stringbuf-wide\n"
 962             "@code{#t} if this stringbuf's characters are stored in a\n"
 963             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
 964             "buffer\n"
 965             "@end table")
 966 #define FUNC_NAME s_scm_sys_symbol_dump
 967 {
 968   SCM e1, e2, e3, e4, e5, e6, e7;
 969   SCM buf;
 970   SCM_VALIDATE_SYMBOL (1, sym);
 971   e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
 972                  sym);
 973   e2 = scm_cons (scm_from_latin1_symbol ("hash"),
 974                  scm_from_ulong (scm_i_symbol_hash (sym)));
 975   e3 = scm_cons (scm_from_latin1_symbol ("interned"),
 976                  scm_symbol_interned_p (sym));
 977   buf = SYMBOL_STRINGBUF (sym);
 978
 979   /* Stringbuf info */
 980   if (!STRINGBUF_WIDE (buf))
 981     {
 982       size_t len = STRINGBUF_LENGTH (buf);
 983       char *cbuf;
 984       SCM sbc = scm_i_make_string (len, &cbuf, 0);
 985       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
 986       e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 987                      sbc);
 988     }
 989   else
 990     {
 991       size_t len = STRINGBUF_LENGTH (buf);
 992       scm_t_wchar *cbuf;
 993       SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
 994       u32_cpy ((scm_t_uint32 *) cbuf,
 995                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
 996       e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 997                      sbc);
 998     }
 999   e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
1000                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
1001   if (STRINGBUF_SHARED (buf))
1002     e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1003                    SCM_BOOL_T);
1004   else
1005     e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1006                    SCM_BOOL_F);
1007   if (STRINGBUF_WIDE (buf))
1008     e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1009                     SCM_BOOL_T);
1010   else
1011     e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1012                     SCM_BOOL_F);
1013   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
1014
1015 }
1016 #undef FUNC_NAME
1017
1018 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1019
1020 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
1021 #define FUNC_NAME s_scm_sys_stringbuf_hist
1022 {
1023   int i;
1024   for (i = 0; i < 1000; i++)
1025     if (lenhist[i])
1026       fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1027   fprintf (stderr, ">999: %u\n", lenhist[1000]);
1028   return SCM_UNSPECIFIED;
1029 }
1030 #undef FUNC_NAME
1031
1032 #endif
1033
1034 \f
1035
1036 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1037             (SCM obj),
1038             "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1039 #define FUNC_NAME s_scm_string_p
1040 {
1041   return scm_from_bool (IS_STRING (obj));
1042 }
1043 #undef FUNC_NAME
1044
1045
1046 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1047
1048 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1049             (SCM chrs),
1050             "@deffnx {Scheme Procedure} list->string chrs\n"
1051             "Return a newly allocated string composed of the arguments,\n"
1052             "@var{chrs}.")
1053 #define FUNC_NAME s_scm_string
1054 {
1055   SCM result = SCM_BOOL_F;
1056   SCM rest;
1057   size_t len;
1058   size_t p = 0;
1059   long i;
1060   int wide = 0;
1061
1062   /* Verify that this is a list of chars.  */
1063   i = scm_ilength (chrs);
1064   SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1065
1066   len = (size_t) i;
1067   rest = chrs;
1068
1069   while (len > 0 && scm_is_pair (rest))
1070     {
1071       SCM elt = SCM_CAR (rest);
1072       SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1073       if (SCM_CHAR (elt) > 0xFF)
1074         wide = 1;
1075       rest = SCM_CDR (rest);
1076       len--;
1077       scm_remember_upto_here_1 (elt);
1078     }
1079
1080   /* Construct a string containing this list of chars.  */
1081   len = (size_t) i;
1082   rest = chrs;
1083
1084   if (wide == 0)
1085     {
1086       char *buf;
1087
1088       result = scm_i_make_string (len, NULL, 0);
1089       result = scm_i_string_start_writing (result);
1090       buf = scm_i_string_writable_chars (result);
1091       while (len > 0 && scm_is_pair (rest))
1092         {
1093           SCM elt = SCM_CAR (rest);
1094           buf[p] = (unsigned char) SCM_CHAR (elt);
1095           p++;
1096           rest = SCM_CDR (rest);
1097           len--;
1098           scm_remember_upto_here_1 (elt);
1099         }
1100     }
1101   else
1102     {
1103       scm_t_wchar *buf;
1104
1105       result = scm_i_make_wide_string (len, NULL, 0);
1106       result = scm_i_string_start_writing (result);
1107       buf = scm_i_string_writable_wide_chars (result);
1108       while (len > 0 && scm_is_pair (rest))
1109         {
1110           SCM elt = SCM_CAR (rest);
1111           buf[p] = SCM_CHAR (elt);
1112           p++;
1113           rest = SCM_CDR (rest);
1114           len--;
1115           scm_remember_upto_here_1 (elt);
1116         }
1117     }
1118   scm_i_string_stop_writing ();
1119
1120   if (len > 0)
1121     scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1122   if (!scm_is_null (rest))
1123     scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1124
1125   return result;
1126 }
1127 #undef FUNC_NAME
1128
1129 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1130             (SCM k, SCM chr),
1131             "Return a newly allocated string of\n"
1132             "length @var{k}.  If @var{chr} is given, then all elements of\n"
1133             "the string are initialized to @var{chr}, otherwise the contents\n"
1134             "of the @var{string} are all set to @var{#\nul}.")
1135 #define FUNC_NAME s_scm_make_string
1136 {
1137   return scm_c_make_string (scm_to_size_t (k), chr);
1138 }
1139 #undef FUNC_NAME
1140
1141 SCM
1142 scm_c_make_string (size_t len, SCM chr)
1143 #define FUNC_NAME NULL
1144 {
1145   size_t p;
1146   char *contents = NULL;
1147   SCM res = scm_i_make_string (len, &contents, 0);
1148
1149   /* If no char is given, initialize string contents to NULL.  */
1150   if (SCM_UNBNDP (chr))
1151     memset (contents, 0, len);
1152   else
1153     {
1154       SCM_VALIDATE_CHAR (0, chr);
1155       res = scm_i_string_start_writing (res);
1156       for (p = 0; p < len; p++)
1157         scm_i_string_set_x (res, p, SCM_CHAR (chr));
1158       scm_i_string_stop_writing ();
1159     }
1160
1161   return res;
1162 }
1163 #undef FUNC_NAME
1164
1165 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1166             (SCM string),
1167             "Return the number of characters in @var{string}.")
1168 #define FUNC_NAME s_scm_string_length
1169 {
1170   SCM_VALIDATE_STRING (1, string);
1171   return scm_from_size_t (STRING_LENGTH (string));
1172 }
1173 #undef FUNC_NAME
1174
1175 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1176             (SCM string),
1177             "Return the bytes used to represent a character in @var{string}."
1178             "This will return 1 or 4.")
1179 #define FUNC_NAME s_scm_string_bytes_per_char
1180 {
1181   SCM_VALIDATE_STRING (1, string);
1182   if (!scm_i_is_narrow_string (string))
1183     return scm_from_int (4);
1184
1185   return scm_from_int (1);
1186 }
1187 #undef FUNC_NAME
1188
1189 size_t
1190 scm_c_string_length (SCM string)
1191 {
1192   if (!IS_STRING (string))
1193     scm_wrong_type_arg_msg (NULL, 0, string, "string");
1194   return STRING_LENGTH (string);
1195 }
1196
1197 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1198             (SCM str, SCM k),
1199             "Return character @var{k} of @var{str} using zero-origin\n"
1200             "indexing. @var{k} must be a valid index of @var{str}.")
1201 #define FUNC_NAME s_scm_string_ref
1202 {
1203   size_t len;
1204   unsigned long idx;
1205
1206   SCM_VALIDATE_STRING (1, str);
1207
1208   len = scm_i_string_length (str);
1209   if (SCM_LIKELY (len > 0))
1210     idx = scm_to_unsigned_integer (k, 0, len - 1);
1211   else
1212     scm_out_of_range (NULL, k);
1213
1214   if (scm_i_is_narrow_string (str))
1215     return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1216   else
1217     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1218 }
1219 #undef FUNC_NAME
1220
1221 SCM
1222 scm_c_string_ref (SCM str, size_t p)
1223 {
1224   if (p >= scm_i_string_length (str))
1225     scm_out_of_range (NULL, scm_from_size_t (p));
1226   if (scm_i_is_narrow_string (str))
1227     return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1228   else
1229     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1230
1231 }
1232
1233 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1234             (SCM str, SCM k, SCM chr),
1235             "Store @var{chr} in element @var{k} of @var{str} and return\n"
1236             "an unspecified value. @var{k} must be a valid index of\n"
1237             "@var{str}.")
1238 #define FUNC_NAME s_scm_string_set_x
1239 {
1240   size_t len;
1241   unsigned long idx;
1242
1243   SCM_VALIDATE_STRING (1, str);
1244
1245   len = scm_i_string_length (str);
1246   if (SCM_LIKELY (len > 0))
1247     idx = scm_to_unsigned_integer (k, 0, len - 1);
1248   else
1249     scm_out_of_range (NULL, k);
1250
1251   SCM_VALIDATE_CHAR (3, chr);
1252   str = scm_i_string_start_writing (str);
1253   scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1254   scm_i_string_stop_writing ();
1255
1256   return SCM_UNSPECIFIED;
1257 }
1258 #undef FUNC_NAME
1259
1260 void
1261 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1262 {
1263   if (p >= scm_i_string_length (str))
1264     scm_out_of_range (NULL, scm_from_size_t (p));
1265   str = scm_i_string_start_writing (str);
1266   scm_i_string_set_x (str, p, SCM_CHAR (chr));
1267   scm_i_string_stop_writing ();
1268 }
1269
1270 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1271             (SCM str, SCM start, SCM end),
1272             "Return a newly allocated string formed from the characters\n"
1273             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1274             "ending with index @var{end} (exclusive).\n"
1275             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1276             "exact integers satisfying:\n\n"
1277             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1278 #define FUNC_NAME s_scm_substring
1279 {
1280   size_t len, from, to;
1281
1282   SCM_VALIDATE_STRING (1, str);
1283   len = scm_i_string_length (str);
1284   from = scm_to_unsigned_integer (start, 0, len);
1285   if (SCM_UNBNDP (end))
1286     to = len;
1287   else
1288     to = scm_to_unsigned_integer (end, from, len);
1289   return scm_i_substring (str, from, to);
1290 }
1291 #undef FUNC_NAME
1292
1293 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1294             (SCM str, SCM start, SCM end),
1295             "Return a newly allocated string formed from the characters\n"
1296             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1297             "ending with index @var{end} (exclusive).\n"
1298             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1299             "exact integers satisfying:\n"
1300             "\n"
1301             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1302             "\n"
1303             "The returned string is read-only.\n")
1304 #define FUNC_NAME s_scm_substring_read_only
1305 {
1306   size_t len, from, to;
1307
1308   SCM_VALIDATE_STRING (1, str);
1309   len = scm_i_string_length (str);
1310   from = scm_to_unsigned_integer (start, 0, len);
1311   if (SCM_UNBNDP (end))
1312     to = len;
1313   else
1314     to = scm_to_unsigned_integer (end, from, len);
1315   return scm_i_substring_read_only (str, from, to);
1316 }
1317 #undef FUNC_NAME
1318
1319 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1320             (SCM str, SCM start, SCM end),
1321             "Return a newly allocated string formed from the characters\n"
1322             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1323             "ending with index @var{end} (exclusive).\n"
1324             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1325             "exact integers satisfying:\n\n"
1326             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1327 #define FUNC_NAME s_scm_substring_copy
1328 {
1329   /* For the Scheme version, START is mandatory, but for the C
1330      version, it is optional.  See scm_string_copy in srfi-13.c for a
1331      rationale.
1332   */
1333
1334   size_t from, to;
1335
1336   SCM_VALIDATE_STRING (1, str);
1337   scm_i_get_substring_spec (scm_i_string_length (str),
1338                             start, &from, end, &to);
1339   return scm_i_substring_copy (str, from, to);
1340 }
1341 #undef FUNC_NAME
1342
1343 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1344             (SCM str, SCM start, SCM end),
1345             "Return string that indirectly refers to the characters\n"
1346             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1347             "ending with index @var{end} (exclusive).\n"
1348             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1349             "exact integers satisfying:\n\n"
1350             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1351 #define FUNC_NAME s_scm_substring_shared
1352 {
1353   size_t len, from, to;
1354
1355   SCM_VALIDATE_STRING (1, str);
1356   len = scm_i_string_length (str);
1357   from = scm_to_unsigned_integer (start, 0, len);
1358   if (SCM_UNBNDP (end))
1359     to = len;
1360   else
1361     to = scm_to_unsigned_integer (end, from, len);
1362   return scm_i_substring_shared (str, from, to);
1363 }
1364 #undef FUNC_NAME
1365
1366 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1367             (SCM args),
1368             "Return a newly allocated string whose characters form the\n"
1369             "concatenation of the given strings, @var{args}.")
1370 #define FUNC_NAME s_scm_string_append
1371 {
1372   SCM res;
1373   size_t len = 0;
1374   int wide = 0;
1375   SCM l, s;
1376   size_t i;
1377   union
1378   {
1379     char *narrow;
1380     scm_t_wchar *wide;
1381   } data;
1382
1383   SCM_VALIDATE_REST_ARGUMENT (args);
1384   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1385     {
1386       s = SCM_CAR (l);
1387       SCM_VALIDATE_STRING (SCM_ARGn, s);
1388       len += scm_i_string_length (s);
1389       if (!scm_i_is_narrow_string (s))
1390         wide = 1;
1391     }
1392   data.narrow = NULL;
1393   if (!wide)
1394     res = scm_i_make_string (len, &data.narrow, 0);
1395   else
1396     res = scm_i_make_wide_string (len, &data.wide, 0);
1397
1398   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1399     {
1400       size_t len;
1401       s = SCM_CAR (l);
1402       SCM_VALIDATE_STRING (SCM_ARGn, s);
1403       len = scm_i_string_length (s);
1404       if (!wide)
1405         {
1406           memcpy (data.narrow, scm_i_string_chars (s), len);
1407           data.narrow += len;
1408         }
1409       else
1410         {
1411           if (scm_i_is_narrow_string (s))
1412             {
1413               for (i = 0; i < scm_i_string_length (s); i++)
1414                 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
1415             }
1416           else
1417             u32_cpy ((scm_t_uint32 *) data.wide,
1418                      (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1419           data.wide += len;
1420         }
1421       scm_remember_upto_here_1 (s);
1422     }
1423   return res;
1424 }
1425 #undef FUNC_NAME
1426
1427
1428 \f
1429 /* Charset conversion error handling.  */
1430
1431 SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1432 SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1433
1434 /* Raise an exception informing that character CHR could not be written
1435    to PORT in its current encoding.  */
1436 void
1437 scm_encoding_error (const char *subr, int err, const char *message,
1438                     SCM port, SCM chr)
1439 {
1440   scm_throw (scm_encoding_error_key,
1441              scm_list_n (scm_from_latin1_string (subr),
1442                          scm_from_latin1_string (message),
1443                          scm_from_int (err),
1444                          port, chr,
1445                          SCM_UNDEFINED));
1446 }
1447
1448 /* Raise an exception informing of an encoding error on PORT.  This
1449    means that a character could not be written in PORT's encoding.  */
1450 void
1451 scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1452 {
1453   scm_throw (scm_decoding_error_key,
1454              scm_list_n (scm_from_latin1_string (subr),
1455                          scm_from_latin1_string (message),
1456                          scm_from_int (err),
1457                          port,
1458                          SCM_UNDEFINED));
1459 }
1460
1461 \f
1462 /* String conversion to/from C.  */
1463
1464 SCM
1465 scm_from_stringn (const char *str, size_t len, const char *encoding,
1466                   scm_t_string_failed_conversion_handler handler)
1467 {
1468   size_t u32len, i;
1469   scm_t_wchar *u32;
1470   int wide = 0;
1471   SCM res;
1472
1473   /* The order of these checks is important. */
1474   if (!str && len != 0)
1475     scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1476   if (len == (size_t) -1)
1477     len = strlen (str);
1478
1479   if (encoding == NULL || len == 0)
1480     {
1481       /* If encoding is null (or the string is empty), use Latin-1.  */
1482       char *buf;
1483       res = scm_i_make_string (len, &buf, 0);
1484       memcpy (buf, str, len);
1485       return res;
1486     }
1487
1488   u32len = 0;
1489   u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1490                                                 (enum iconv_ilseq_handler)
1491                                                 handler,
1492                                                 str, len,
1493                                                 NULL,
1494                                                 NULL, &u32len);
1495
1496   if (SCM_UNLIKELY (u32 == NULL))
1497     {
1498       /* Raise an error and pass the raw C string as a bytevector to the `throw'
1499          handler.  */
1500       SCM bv;
1501       signed char *buf;
1502
1503       buf = scm_gc_malloc_pointerless (len, "bytevector");
1504       memcpy (buf, str, len);
1505       bv = scm_c_take_gc_bytevector (buf, len);
1506
1507       scm_decoding_error (__func__, errno,
1508                           "input locale conversion error", bv);
1509     }
1510
1511   i = 0;
1512   while (i < u32len)
1513     if (u32[i++] > 0xFF)
1514       {
1515         wide = 1;
1516         break;
1517       }
1518
1519   if (!wide)
1520     {
1521       char *dst;
1522       res = scm_i_make_string (u32len, &dst, 0);
1523       for (i = 0; i < u32len; i ++)
1524         dst[i] = (unsigned char) u32[i];
1525       dst[u32len] = '\0';
1526     }
1527   else
1528     {
1529       scm_t_wchar *wdst;
1530       res = scm_i_make_wide_string (u32len, &wdst, 0);
1531       u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1532       wdst[u32len] = 0;
1533     }
1534
1535   free (u32);
1536   return res;
1537 }
1538
1539 SCM
1540 scm_from_locale_string (const char *str)
1541 {
1542   return scm_from_locale_stringn (str, -1);
1543 }
1544
1545 SCM
1546 scm_from_locale_stringn (const char *str, size_t len)
1547 {
1548   return scm_from_stringn (str, len, locale_charset (),
1549                            scm_i_get_conversion_strategy (SCM_BOOL_F));
1550 }
1551
1552 SCM
1553 scm_from_latin1_string (const char *str)
1554 {
1555   return scm_from_latin1_stringn (str, -1);
1556 }
1557
1558 SCM
1559 scm_from_latin1_stringn (const char *str, size_t len)
1560 {
1561   char *buf;
1562   SCM result;
1563
1564   if (len == (size_t) -1)
1565     len = strlen (str);
1566
1567   /* Make a narrow string and copy STR as is.  */
1568   result = scm_i_make_string (len, &buf, 0);
1569   memcpy (buf, str, len);
1570
1571   return result;
1572 }
1573
1574 SCM
1575 scm_from_utf8_string (const char *str)
1576 {
1577   return scm_from_utf8_stringn (str, -1);
1578 }
1579
1580 SCM
1581 scm_from_utf8_stringn (const char *str, size_t len)
1582 {
1583   return scm_from_stringn (str, len, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1584 }
1585
1586 SCM
1587 scm_from_utf32_string (const scm_t_wchar *str)
1588 {
1589   return scm_from_utf32_stringn (str, -1);
1590 }
1591
1592 SCM
1593 scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1594 {
1595   SCM result;
1596   scm_t_wchar *buf;
1597
1598   if (len == (size_t) -1)
1599     len = u32_strlen ((uint32_t *) str);
1600
1601   result = scm_i_make_wide_string (len, &buf, 0);
1602   memcpy (buf, str, len * sizeof (scm_t_wchar));
1603   scm_i_try_narrow_string (result);
1604
1605   return result;
1606 }
1607
1608 /* Create a new scheme string from the C string STR.  The memory of
1609    STR may be used directly as storage for the new string.  */
1610 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1611    would be to register a finalizer to eventually free(3) STR, which isn't
1612    worth it.  Should we just deprecate the `scm_take_' functions?  */
1613 SCM
1614 scm_take_locale_stringn (char *str, size_t len)
1615 {
1616   SCM res;
1617
1618   res = scm_from_locale_stringn (str, len);
1619   free (str);
1620
1621   return res;
1622 }
1623
1624 SCM
1625 scm_take_locale_string (char *str)
1626 {
1627   return scm_take_locale_stringn (str, -1);
1628 }
1629
1630 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1631    *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1632    Set *LENP to the size of the resulting string.
1633
1634    FIXME: This is a hack we should get rid of.  See
1635    <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1636    for details.  */
1637 static void
1638 unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
1639 {
1640   char *before, *after;
1641   size_t i, j;
1642
1643   before = buf;
1644   after = buf;
1645   i = 0;
1646   j = 0;
1647   while (i < *lenp)
1648     {
1649       if ((i <= *lenp - 6)
1650           && before[i] == '\\'
1651           && before[i + 1] == 'u'
1652           && before[i + 2] == '0' && before[i + 3] == '0')
1653         {
1654           /* Convert \u00NN to \xNN */
1655           after[j] = '\\';
1656           after[j + 1] = 'x';
1657           after[j + 2] = tolower ((int) before[i + 4]);
1658           after[j + 3] = tolower ((int) before[i + 5]);
1659           i += 6;
1660           j += 4;
1661         }
1662       else if ((i <= *lenp - 10)
1663                && before[i] == '\\'
1664                && before[i + 1] == 'U'
1665                && before[i + 2] == '0' && before[i + 3] == '0')
1666         {
1667           /* Convert \U00NNNNNN to \UNNNNNN */
1668           after[j] = '\\';
1669           after[j + 1] = 'U';
1670           after[j + 2] = tolower ((int) before[i + 4]);
1671           after[j + 3] = tolower ((int) before[i + 5]);
1672           after[j + 4] = tolower ((int) before[i + 6]);
1673           after[j + 5] = tolower ((int) before[i + 7]);
1674           after[j + 6] = tolower ((int) before[i + 8]);
1675           after[j + 7] = tolower ((int) before[i + 9]);
1676           i += 10;
1677           j += 8;
1678         }
1679       else
1680         {
1681           after[j] = before[i];
1682           i++;
1683           j++;
1684         }
1685     }
1686   *lenp = j;
1687 }
1688
1689 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1690    *LENP-byte locale-encoded string, to `\xXXXX;'.  Set *LEN to the size
1691    of the resulting string.  BUF must be large enough to handle the
1692    worst case when `\uXXXX' escapes (6 characters) are replaced by
1693    `\xXXXX;' (7 characters).  */
1694 static void
1695 unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
1696 {
1697   char *before, *after;
1698   size_t i, j;
1699   /* The worst case is if the input string contains all 4-digit hex escapes.
1700      "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1701   size_t max_out_len = (*lenp * 7) / 6 + 1;
1702   size_t nzeros, ndigits;
1703
1704   before = buf;
1705   after = alloca (max_out_len);
1706   i = 0;
1707   j = 0;
1708   while (i < *lenp)
1709     {
1710       if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1711           || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1712         {
1713           if (before[i + 1] == 'u')
1714             ndigits = 4;
1715           else if (before[i + 1] == 'U')
1716             ndigits = 8;
1717           else
1718             abort ();
1719
1720           /* Add the R6RS hex escape initial sequence.  */
1721           after[j] = '\\';
1722           after[j + 1] = 'x';
1723
1724           /* Move string positions to the start of the hex numbers.  */
1725           i += 2;
1726           j += 2;
1727
1728           /* Find the number of initial zeros in this hex number.  */
1729           nzeros = 0;
1730           while (before[i + nzeros] == '0' && nzeros < ndigits)
1731             nzeros++;
1732
1733           /* Copy the number, skipping initial zeros, and then move the string
1734              positions.  */
1735           if (nzeros == ndigits)
1736             {
1737               after[j] = '0';
1738               i += ndigits;
1739               j += 1;
1740             }
1741           else
1742             {
1743               int pos;
1744               for (pos = 0; pos < ndigits - nzeros; pos++)
1745                 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1746               i += ndigits;
1747               j += (ndigits - nzeros);
1748             }
1749
1750           /* Add terminating semicolon.  */
1751           after[j] = ';';
1752           j++;
1753         }
1754       else
1755         {
1756           after[j] = before[i];
1757           i++;
1758           j++;
1759         }
1760     }
1761   *lenp = j;
1762   memcpy (before, after, j);
1763 }
1764
1765 char *
1766 scm_to_locale_string (SCM str)
1767 {
1768   return scm_to_locale_stringn (str, NULL);
1769 }
1770
1771 char *
1772 scm_to_locale_stringn (SCM str, size_t *lenp)
1773 {
1774   return scm_to_stringn (str, lenp,
1775                          locale_charset (),
1776                          scm_i_get_conversion_strategy (SCM_BOOL_F));
1777 }
1778
1779 char *
1780 scm_to_latin1_string (SCM str)
1781 {
1782   return scm_to_latin1_stringn (str, NULL);
1783 }
1784
1785 char *
1786 scm_to_latin1_stringn (SCM str, size_t *lenp)
1787 #define FUNC_NAME "scm_to_latin1_stringn"
1788 {
1789   char *result;
1790
1791   SCM_VALIDATE_STRING (1, str);
1792
1793   if (scm_i_is_narrow_string (str))
1794     {
1795       size_t len = scm_i_string_length (str);
1796
1797       if (lenp)
1798         *lenp = len;
1799
1800       result = scm_strndup (scm_i_string_data (str), len);
1801     }
1802   else
1803     result = scm_to_stringn (str, lenp, NULL,
1804                              SCM_FAILED_CONVERSION_ERROR);
1805
1806   return result;
1807 }
1808 #undef FUNC_NAME
1809
1810 char *
1811 scm_to_utf8_string (SCM str)
1812 {
1813   return scm_to_utf8_stringn (str, NULL);
1814 }
1815
1816 char *
1817 scm_to_utf8_stringn (SCM str, size_t *lenp)
1818 {
1819   return scm_to_stringn (str, lenp, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1820 }
1821
1822 scm_t_wchar *
1823 scm_to_utf32_string (SCM str)
1824 {
1825   return scm_to_utf32_stringn (str, NULL);
1826 }
1827
1828 scm_t_wchar *
1829 scm_to_utf32_stringn (SCM str, size_t *lenp)
1830 #define FUNC_NAME "scm_to_utf32_stringn"
1831 {
1832   scm_t_wchar *result;
1833
1834   SCM_VALIDATE_STRING (1, str);
1835
1836   if (scm_i_is_narrow_string (str))
1837     result = (scm_t_wchar *)
1838       scm_to_stringn (str, lenp, "UTF-32",
1839                       SCM_FAILED_CONVERSION_ERROR);
1840   else
1841     {
1842       size_t len;
1843
1844       len = scm_i_string_length (str);
1845       if (lenp)
1846         *lenp = len;
1847
1848       result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
1849       memcpy (result, scm_i_string_wide_chars (str),
1850               len * sizeof (scm_t_wchar));
1851       result[len] = 0;
1852     }
1853
1854   return result;
1855 }
1856 #undef FUNC_NAME
1857
1858 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
1859    according to ENCODING.  If LENP is non-NULL, set it to the size in bytes of
1860    the returned buffer.  If the conversion to ENCODING fails, apply the strategy
1861    defined by HANDLER.  */
1862 char *
1863 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
1864                 scm_t_string_failed_conversion_handler handler)
1865 {
1866   char *buf;
1867   size_t ilen, len, i;
1868   int ret;
1869   const char *enc;
1870
1871   if (!scm_is_string (str))
1872     scm_wrong_type_arg_msg (NULL, 0, str, "string");
1873   ilen = scm_i_string_length (str);
1874
1875   if (ilen == 0)
1876     {
1877       buf = scm_malloc (1);
1878       buf[0] = '\0';
1879       if (lenp)
1880         *lenp = 0;
1881       return buf;
1882     }
1883
1884   if (lenp == NULL)
1885     for (i = 0; i < ilen; i++)
1886       if (scm_i_string_ref (str, i) == '\0')
1887         scm_misc_error (NULL,
1888                         "string contains #\\nul character: ~S",
1889                         scm_list_1 (str));
1890
1891   if (scm_i_is_narrow_string (str) && (encoding == NULL))
1892     {
1893       /* If using native Latin-1 encoding, just copy the string
1894          contents.  */
1895       if (lenp)
1896         {
1897           buf = scm_malloc (ilen);
1898           memcpy (buf, scm_i_string_chars (str), ilen);
1899           *lenp = ilen;
1900           return buf;
1901         }
1902       else
1903         {
1904           buf = scm_malloc (ilen + 1);
1905           memcpy (buf, scm_i_string_chars (str), ilen);
1906           buf[ilen] = '\0';
1907           return buf;
1908         }
1909     }
1910
1911
1912   buf = NULL;
1913   len = 0;
1914   enc = encoding;
1915   if (enc == NULL)
1916     enc = "ISO-8859-1";
1917   if (scm_i_is_narrow_string (str))
1918     {
1919       ret = mem_iconveh (scm_i_string_chars (str), ilen,
1920                          "ISO-8859-1", enc,
1921                          (enum iconv_ilseq_handler) handler, NULL,
1922                          &buf, &len);
1923
1924       if (ret != 0)
1925         scm_encoding_error (__func__, errno,
1926                             "cannot convert narrow string to output locale",
1927                             SCM_BOOL_F,
1928                             /* FIXME: Faulty character unknown.  */
1929                             SCM_BOOL_F);
1930     }
1931   else
1932     {
1933       buf = u32_conv_to_encoding (enc,
1934                                   (enum iconv_ilseq_handler) handler,
1935                                   (scm_t_uint32 *) scm_i_string_wide_chars (str),
1936                                   ilen,
1937                                   NULL,
1938                                   NULL, &len);
1939       if (buf == NULL)
1940         scm_encoding_error (__func__, errno,
1941                             "cannot convert wide string to output locale",
1942                             SCM_BOOL_F,
1943                             /* FIXME: Faulty character unknown.  */
1944                             SCM_BOOL_F);
1945     }
1946   if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
1947     {
1948       if (SCM_R6RS_ESCAPES_P)
1949         {
1950           /* The worst case is if the input string contains all 4-digit
1951              hex escapes.  "\uXXXX" (six characters) becomes "\xXXXX;"
1952              (seven characters).  Make BUF large enough to hold
1953              that.  */
1954           buf = scm_realloc (buf, (len * 7) / 6 + 1);
1955           unistring_escapes_to_r6rs_escapes (buf, &len);
1956         }
1957       else
1958         unistring_escapes_to_guile_escapes (buf, &len);
1959
1960       buf = scm_realloc (buf, len);
1961     }
1962   if (lenp)
1963     *lenp = len;
1964   else
1965     {
1966       buf = scm_realloc (buf, len + 1);
1967       buf[len] = '\0';
1968     }
1969
1970   scm_remember_upto_here_1 (str);
1971   return buf;
1972 }
1973
1974 size_t
1975 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
1976 {
1977   size_t len;
1978   char *result = NULL;
1979   if (!scm_is_string (str))
1980     scm_wrong_type_arg_msg (NULL, 0, str, "string");
1981   result = scm_to_locale_stringn (str, &len);
1982
1983   memcpy (buf, result, (len > max_len) ? max_len : len);
1984   free (result);
1985
1986   scm_remember_upto_here_1 (str);
1987   return len;
1988 }
1989
1990 \f
1991 /* Unicode string normalization.  */
1992
1993 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
1994    libguile/i18n.c.  It would be useful to have this factored out into a more
1995    convenient location, but its use of alloca makes that tricky to do. */
1996
1997 static SCM
1998 normalize_str (SCM string, uninorm_t form)
1999 {
2000   SCM ret;
2001   scm_t_uint32 *w_str;
2002   scm_t_wchar *cbuf;
2003   size_t rlen, len = scm_i_string_length (string);
2004
2005   if (scm_i_is_narrow_string (string))
2006     {
2007       size_t i;
2008       const char *buf = scm_i_string_chars (string);
2009
2010       w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2011
2012       for (i = 0; i < len; i ++)
2013         w_str[i] = (unsigned char) buf[i];
2014       w_str[len] = 0;
2015     }
2016   else
2017     w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2018
2019   w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2020
2021   ret = scm_i_make_wide_string (rlen, &cbuf, 0);
2022   u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2023   free (w_str);
2024
2025   scm_i_try_narrow_string (ret);
2026
2027   return ret;
2028 }
2029
2030 SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2031             (SCM string),
2032             "Returns the NFC normalized form of @var{string}.")
2033 #define FUNC_NAME s_scm_string_normalize_nfc
2034 {
2035   SCM_VALIDATE_STRING (1, string);
2036   return normalize_str (string, UNINORM_NFC);
2037 }
2038 #undef FUNC_NAME
2039
2040 SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2041             (SCM string),
2042             "Returns the NFD normalized form of @var{string}.")
2043 #define FUNC_NAME s_scm_string_normalize_nfd
2044 {
2045   SCM_VALIDATE_STRING (1, string);
2046   return normalize_str (string, UNINORM_NFD);
2047 }
2048 #undef FUNC_NAME
2049
2050 SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2051             (SCM string),
2052             "Returns the NFKC normalized form of @var{string}.")
2053 #define FUNC_NAME s_scm_string_normalize_nfkc
2054 {
2055   SCM_VALIDATE_STRING (1, string);
2056   return normalize_str (string, UNINORM_NFKC);
2057 }
2058 #undef FUNC_NAME
2059
2060 SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2061             (SCM string),
2062             "Returns the NFKD normalized form of @var{string}.")
2063 #define FUNC_NAME s_scm_string_normalize_nfkd
2064 {
2065   SCM_VALIDATE_STRING (1, string);
2066   return normalize_str (string, UNINORM_NFKD);
2067 }
2068 #undef FUNC_NAME
2069
2070 /* converts C scm_array of strings to SCM scm_list of strings.
2071    If argc < 0, a null terminated scm_array is assumed.
2072    The current locale encoding is assumed */
2073 SCM
2074 scm_makfromstrs (int argc, char **argv)
2075 {
2076   int i = argc;
2077   SCM lst = SCM_EOL;
2078   if (0 > i)
2079     for (i = 0; argv[i]; i++);
2080   while (i--)
2081     lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2082   return lst;
2083 }
2084
2085 /* Return a newly allocated array of char pointers to each of the strings
2086    in args, with a terminating NULL pointer.  The strings are encoded using
2087    the current locale. */
2088
2089 char **
2090 scm_i_allocate_string_pointers (SCM list)
2091 #define FUNC_NAME "scm_i_allocate_string_pointers"
2092 {
2093   char **result;
2094   int list_len = scm_ilength (list);
2095   int i;
2096
2097   if (list_len < 0)
2098     scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2099
2100   result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2101                           "string pointers");
2102   result[list_len] = NULL;
2103
2104   /* The list might have been modified in another thread, so
2105      we check LIST before each access.
2106    */
2107   for (i = 0; i < list_len && scm_is_pair (list); i++)
2108     {
2109       SCM str = SCM_CAR (list);
2110       size_t len;  /* String length in bytes */
2111       char *c_str = scm_to_locale_stringn (str, &len);
2112
2113       /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2114          scm_malloc to allocate the returned string, which must be
2115          explicitly deallocated.  This forces us to copy the string a
2116          second time into a new buffer.  Ideally there would be variants
2117          of scm_to_*_stringn that can return garbage-collected buffers. */
2118
2119       result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2120       memcpy (result[i], c_str, len);
2121       result[i][len] = '\0';
2122       free (c_str);
2123
2124       list = SCM_CDR (list);
2125     }
2126
2127   return result;
2128 }
2129 #undef FUNC_NAME
2130
2131 void
2132 scm_i_get_substring_spec (size_t len,
2133                           SCM start, size_t *cstart,
2134                           SCM end, size_t *cend)
2135 {
2136   if (SCM_UNBNDP (start))
2137     *cstart = 0;
2138   else
2139     *cstart = scm_to_unsigned_integer (start, 0, len);
2140
2141   if (SCM_UNBNDP (end))
2142     *cend = len;
2143   else
2144     *cend = scm_to_unsigned_integer (end, *cstart, len);
2145 }
2146
2147 #if SCM_ENABLE_DEPRECATED
2148
2149 /* When these definitions are removed, it becomes reasonable to use
2150    read-only strings for string literals.  For that, change the reader
2151    to create string literals with scm_c_substring_read_only instead of
2152    with scm_c_substring_copy.
2153 */
2154
2155 int
2156 scm_i_deprecated_stringp (SCM str)
2157 {
2158   scm_c_issue_deprecation_warning
2159     ("SCM_STRINGP is deprecated.  Use scm_is_string instead.");
2160
2161   return scm_is_string (str);
2162 }
2163
2164 char *
2165 scm_i_deprecated_string_chars (SCM str)
2166 {
2167   char *chars;
2168
2169   scm_c_issue_deprecation_warning
2170     ("SCM_STRING_CHARS is deprecated.  See the manual for alternatives.");
2171
2172   /* We don't accept shared substrings here since they are not
2173      null-terminated.
2174   */
2175   if (IS_SH_STRING (str))
2176     scm_misc_error (NULL,
2177                     "SCM_STRING_CHARS does not work with shared substrings",
2178                     SCM_EOL);
2179
2180   /* We explicitly test for read-only strings to produce a better
2181      error message.
2182   */
2183
2184   if (IS_RO_STRING (str))
2185     scm_misc_error (NULL,
2186                     "SCM_STRING_CHARS does not work with read-only strings",
2187                     SCM_EOL);
2188
2189   /* The following is still wrong, of course...
2190    */
2191   str = scm_i_string_start_writing (str);
2192   chars = scm_i_string_writable_chars (str);
2193   scm_i_string_stop_writing ();
2194   return chars;
2195 }
2196
2197 size_t
2198 scm_i_deprecated_string_length (SCM str)
2199 {
2200   scm_c_issue_deprecation_warning
2201     ("SCM_STRING_LENGTH is deprecated.  Use scm_c_string_length instead.");
2202   return scm_c_string_length (str);
2203 }
2204
2205 #endif
2206
2207 static SCM
2208 string_handle_ref (scm_t_array_handle *h, size_t index)
2209 {
2210   return scm_c_string_ref (h->array, index);
2211 }
2212
2213 static void
2214 string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2215 {
2216   scm_c_string_set_x (h->array, index, val);
2217 }
2218
2219 static void
2220 string_get_handle (SCM v, scm_t_array_handle *h)
2221 {
2222   h->array = v;
2223   h->ndims = 1;
2224   h->dims = &h->dim0;
2225   h->dim0.lbnd = 0;
2226   h->dim0.ubnd = scm_c_string_length (v) - 1;
2227   h->dim0.inc = 1;
2228   h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2229   h->elements = h->writable_elements = NULL;
2230 }
2231
2232 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2233                           string_handle_ref, string_handle_set,
2234                           string_get_handle)
2235 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2236
2237 void
2238 scm_init_strings ()
2239 {
2240   scm_nullstr = scm_i_make_string (0, NULL, 0);
2241
2242 #include "libguile/strings.x"
2243 }
2244
2245
2246 /*
2247   Local Variables:
2248   c-file-style: "gnu"
2249   End:
2250 */