libguile/strings.c

   1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
   2  *
   3  * This library is free software; you can redistribute it and/or
   4  * modify it under the terms of the GNU Lesser General Public License
   5  * as published by the Free Software Foundation; either version 3 of
   6  * the License, or (at your option) any later version.
   7  *
   8  * This library is distributed in the hope that it will be useful, but
   9  * WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * Lesser General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU Lesser General Public
  14  * License along with this library; if not, write to the Free Software
  15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  16  * 02110-1301 USA
  17  */
  18
  19
  20 \f
  21 #ifdef HAVE_CONFIG_H
  22 # include <config.h>
  23 #endif
  24
  25 #include <alloca.h>
  26 #include <string.h>
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <uninorm.h>
  30 #include <unistr.h>
  31 #include <uniconv.h>
  32
  33 #include "striconveh.h"
  34
  35 #include "libguile/_scm.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/root.h"
  38 #include "libguile/strings.h"
  39 #include "libguile/error.h"
  40 #include "libguile/generalized-vectors.h"
  41 #include "libguile/deprecation.h"
  42 #include "libguile/validate.h"
  43 #include "libguile/private-options.h"
  44
  45 \f
  46
  47 /* {Strings}
  48  */
  49
  50
  51 /* Stringbufs
  52  *
  53  * XXX - keeping an accurate refcount during GC seems to be quite
  54  * tricky, so we just keep score of whether a stringbuf might be
  55  * shared, not whether it definitely is.
  56  *
  57  * The scheme I (mvo) tried to keep an accurate reference count would
  58  * recount all strings that point to a stringbuf during the mark-phase
  59  * of the GC.  This was done since one cannot access the stringbuf of
  60  * a string when that string is freed (in order to decrease the
  61  * reference count).  The memory of the stringbuf might have been
  62  * reused already for something completely different.
  63  *
   64  * This recounting worked for a small number of threads beating on
  65  * cow-strings, but it failed randomly with more than 10 threads, say.
  66  * I couldn't figure out what went wrong, so I used the conservative
  67  * approach implemented below.
  68  *
  69  * There are 2 storage strategies for stringbufs: 8-bit and wide.  8-bit
  70  * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
  71  * strings.
  72  */
  73
  74 /* The size in words of the stringbuf header (type tag + size).  */
  75 #define STRINGBUF_HEADER_SIZE   2U
  76
  77 #define STRINGBUF_HEADER_BYTES  (STRINGBUF_HEADER_SIZE * sizeof (SCM))
  78
  79 #define STRINGBUF_F_SHARED      SCM_I_STRINGBUF_F_SHARED
  80 #define STRINGBUF_F_WIDE        SCM_I_STRINGBUF_F_WIDE
  81
  82 #define STRINGBUF_TAG           scm_tc7_stringbuf
  83 #define STRINGBUF_SHARED(buf)   (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
  84 #define STRINGBUF_WIDE(buf)     (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
  85
  86 #define STRINGBUF_CONTENTS(buf) ((void *)                               \
  87                                  SCM_CELL_OBJECT_LOC (buf,              \
  88                                                       STRINGBUF_HEADER_SIZE))
  89 #define STRINGBUF_CHARS(buf)    ((unsigned char *) STRINGBUF_CONTENTS (buf))
  90 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
  91
  92 #define STRINGBUF_LENGTH(buf)   (SCM_CELL_WORD_1 (buf))
  93
  94 #define SET_STRINGBUF_SHARED(buf)                                       \
  95   do                                                                    \
  96     {                                                                   \
  97       /* Don't modify BUF if it's already marked as shared since it might be \
  98          a read-only, statically allocated stringbuf.  */               \
  99       if (SCM_LIKELY (!STRINGBUF_SHARED (buf)))                         \
 100         SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
 101     }                                                                   \
 102   while (0)
 103
 104 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 105 static size_t lenhist[1001];
 106 #endif
 107
 108 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
 109    characters. */
 110 static SCM
 111 make_stringbuf (size_t len)
 112 {
 113   /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
 114      scm_i_symbol_chars, all stringbufs are null-terminated.  Once
 115      SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
 116      has been changed for scm_i_symbol_chars, this null-termination
 117      can be dropped.
 118   */
 119
 120   SCM buf;
 121
 122 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 123   if (len < 1000)
 124     lenhist[len]++;
 125   else
 126     lenhist[1000]++;
 127 #endif
 128
 129   buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
 130                                             "string"));
 131
 132   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
 133   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 134
 135   STRINGBUF_CHARS (buf)[len] = 0;
 136
 137   return buf;
 138 }
 139
 140 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
 141    characters.  */
 142 static SCM
 143 make_wide_stringbuf (size_t len)
 144 {
 145   SCM buf;
 146   size_t raw_len;
 147
 148 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 149   if (len < 1000)
 150     lenhist[len]++;
 151   else
 152     lenhist[1000]++;
 153 #endif
 154
 155   raw_len = (len + 1) * sizeof (scm_t_wchar);
 156   buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
 157                                             "string"));
 158
 159   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
 160   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 161
 162   STRINGBUF_WIDE_CHARS (buf)[len] = 0;
 163
 164   return buf;
 165 }
 166
 167 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
 168    characters from BUF.  */
 169 static SCM
 170 wide_stringbuf (SCM buf)
 171 {
 172   SCM new_buf;
 173
 174   if (STRINGBUF_WIDE (buf))
 175     new_buf = buf;
 176   else
 177     {
 178       size_t i, len;
 179       scm_t_wchar *mem;
 180
 181       len = STRINGBUF_LENGTH (buf);
 182
 183       new_buf = make_wide_stringbuf (len);
 184
 185       mem = STRINGBUF_WIDE_CHARS (new_buf);
 186       for (i = 0; i < len; i++)
 187         mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
 188       mem[len] = 0;
 189     }
 190
 191   return new_buf;
 192 }
 193
 194 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
 195    characters from BUF, if possible.  */
 196 static SCM
 197 narrow_stringbuf (SCM buf)
 198 {
 199   SCM new_buf;
 200
 201   if (!STRINGBUF_WIDE (buf))
 202     new_buf = buf;
 203   else
 204     {
 205       size_t i, len;
 206       scm_t_wchar *wmem;
 207       unsigned char *mem;
 208
 209       len = STRINGBUF_LENGTH (buf);
 210       wmem = STRINGBUF_WIDE_CHARS (buf);
 211
 212       for (i = 0; i < len; i++)
 213         if (wmem[i] > 0xFF)
 214           /* BUF cannot be narrowed.  */
 215           return buf;
 216
 217       new_buf = make_stringbuf (len);
 218
 219       mem = STRINGBUF_CHARS (new_buf);
 220       for (i = 0; i < len; i++)
 221         mem[i] = (unsigned char) wmem[i];
 222       mem[len] = 0;
 223     }
 224
 225   return new_buf;
 226 }
 227
 228 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
 229
 230 \f
 231 /* Copy-on-write strings.
 232  */
 233
 234 #define STRING_TAG            scm_tc7_string
 235
 236 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
 237 #define STRING_START(str)     ((size_t)SCM_CELL_WORD_2(str))
 238 #define STRING_LENGTH(str)    ((size_t)SCM_CELL_WORD_3(str))
 239
 240 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
 241 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
 242
 243 #define IS_STRING(str)        (SCM_HAS_TYP7 (str, STRING_TAG))
 244
 245 /* Read-only strings.
 246  */
 247
 248 #define RO_STRING_TAG         scm_tc7_ro_string
 249 #define IS_RO_STRING(str)     (SCM_CELL_TYPE(str)==RO_STRING_TAG)
 250
 251 /* Mutation-sharing substrings
 252  */
 253
 254 #define SH_STRING_TAG       (scm_tc7_string + 0x100)
 255
 256 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
 257 /* START and LENGTH as for STRINGs. */
 258
 259 #define IS_SH_STRING(str)   (SCM_CELL_TYPE(str)==SH_STRING_TAG)
 260
 261 SCM scm_nullstr;
 262
 263 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
 264    characters.  CHARSP, if not NULL, will be set to location of the
 265    char array.  If READ_ONLY_P, the returned string is read-only;
 266    otherwise it is writable.  */
 267 SCM
 268 scm_i_make_string (size_t len, char **charsp, int read_only_p)
 269 {
 270   SCM buf = make_stringbuf (len);
 271   SCM res;
 272   if (charsp)
 273     *charsp = (char *) STRINGBUF_CHARS (buf);
 274   res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
 275                          SCM_UNPACK (buf),
 276                          (scm_t_bits) 0, (scm_t_bits) len);
 277   return res;
 278 }
 279
 280 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
 281    characters.  CHARSP, if not NULL, will be set to location of the
 282    character array.  If READ_ONLY_P, the returned string is read-only;
 283    otherwise it is writable.  */
 284 SCM
 285 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
 286 {
 287   SCM buf = make_wide_stringbuf (len);
 288   SCM res;
 289   if (charsp)
 290     *charsp = STRINGBUF_WIDE_CHARS (buf);
 291   res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
 292                          SCM_UNPACK (buf),
 293                          (scm_t_bits) 0, (scm_t_bits) len);
 294   return res;
 295 }
 296
 297 static void
 298 validate_substring_args (SCM str, size_t start, size_t end)
 299 {
 300   if (!IS_STRING (str))
 301     scm_wrong_type_arg_msg (NULL, 0, str, "string");
 302   if (start > STRING_LENGTH (str))
 303     scm_out_of_range (NULL, scm_from_size_t (start));
 304   if (end > STRING_LENGTH (str) || end < start)
 305     scm_out_of_range (NULL, scm_from_size_t (end));
 306 }
 307
 308 static inline void
 309 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
 310 {
 311   *start = STRING_START (*str);
 312   if (IS_SH_STRING (*str))
 313     {
 314       *str = SH_STRING_STRING (*str);
 315       *start += STRING_START (*str);
 316     }
 317   *buf = STRING_STRINGBUF (*str);
 318 }
 319
 320 SCM
 321 scm_i_substring (SCM str, size_t start, size_t end)
 322 {
 323   SCM buf;
 324   size_t str_start;
 325   get_str_buf_start (&str, &buf, &str_start);
 326   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 327   SET_STRINGBUF_SHARED (buf);
 328   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 329   return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
 330                           (scm_t_bits)str_start + start,
 331                           (scm_t_bits) end - start);
 332 }
 333
 334 SCM
 335 scm_i_substring_read_only (SCM str, size_t start, size_t end)
 336 {
 337   SCM buf;
 338   size_t str_start;
 339   get_str_buf_start (&str, &buf, &str_start);
 340   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 341   SET_STRINGBUF_SHARED (buf);
 342   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 343   return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
 344                           (scm_t_bits)str_start + start,
 345                           (scm_t_bits) end - start);
 346 }
 347
 348 SCM
 349 scm_i_substring_copy (SCM str, size_t start, size_t end)
 350 {
 351   size_t len = end - start;
 352   SCM buf, my_buf;
 353   size_t str_start;
 354   get_str_buf_start (&str, &buf, &str_start);
 355   if (scm_i_is_narrow_string (str))
 356     {
 357       my_buf = make_stringbuf (len);
 358       memcpy (STRINGBUF_CHARS (my_buf),
 359               STRINGBUF_CHARS (buf) + str_start + start, len);
 360     }
 361   else
 362     {
 363       my_buf = make_wide_stringbuf (len);
 364       u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
 365                (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
 366                                  + start), len);
 367       /* Even though this string is wide, the substring may be narrow.
 368          Consider adding code to narrow the string.  */
 369     }
 370   scm_remember_upto_here_1 (buf);
 371   return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
 372                           (scm_t_bits) 0, (scm_t_bits) len);
 373 }
 374
 375 SCM
 376 scm_i_substring_shared (SCM str, size_t start, size_t end)
 377 {
 378   if (start == 0 && end == STRING_LENGTH (str))
 379     return str;
 380   else
 381     {
 382       size_t len = end - start;
 383       if (IS_SH_STRING (str))
 384         {
 385           start += STRING_START (str);
 386           str = SH_STRING_STRING (str);
 387         }
 388       return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
 389                               (scm_t_bits)start, (scm_t_bits) len);
 390     }
 391 }
 392
 393 SCM
 394 scm_c_substring (SCM str, size_t start, size_t end)
 395 {
 396   validate_substring_args (str, start, end);
 397   return scm_i_substring (str, start, end);
 398 }
 399
 400 SCM
 401 scm_c_substring_read_only (SCM str, size_t start, size_t end)
 402 {
 403   validate_substring_args (str, start, end);
 404   return scm_i_substring_read_only (str, start, end);
 405 }
 406
 407 SCM
 408 scm_c_substring_copy (SCM str, size_t start, size_t end)
 409 {
 410   validate_substring_args (str, start, end);
 411   return scm_i_substring_copy (str, start, end);
 412 }
 413
 414 SCM
 415 scm_c_substring_shared (SCM str, size_t start, size_t end)
 416 {
 417   validate_substring_args (str, start, end);
 418   return scm_i_substring_shared (str, start, end);
 419 }
 420
 421 \f
 422 /* Internal accessors
 423  */
 424
 425 /* Returns the number of characters in STR.  This may be different
 426    than the memory size of the string storage.  */
 427 size_t
 428 scm_i_string_length (SCM str)
 429 {
 430   return STRING_LENGTH (str);
 431 }
 432
 433 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
 434    encoding.  False if it is 'wide', having a 32-bit UCS-4
 435    encoding.  */
 436 int
 437 scm_i_is_narrow_string (SCM str)
 438 {
 439   if (IS_SH_STRING (str))
 440     str = SH_STRING_STRING (str);
 441
 442   return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
 443 }
 444
 445 /* Try to coerce a string to be narrow.  It if is narrow already, do
 446    nothing.  If it is wide, shrink it to narrow if none of its
 447    characters are above 0xFF.  Return true if the string is narrow or
 448    was made to be narrow.  */
 449 int
 450 scm_i_try_narrow_string (SCM str)
 451 {
 452   if (IS_SH_STRING (str))
 453     str = SH_STRING_STRING (str);
 454
 455   SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
 456
 457   return scm_i_is_narrow_string (str);
 458 }
 459
 460 /* Return a pointer to the raw data of the string, which can be either Latin-1
 461    or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'.  */
 462 const void *
 463 scm_i_string_data (SCM str)
 464 {
 465   SCM buf;
 466   size_t start;
 467   const char *data;
 468
 469   get_str_buf_start (&str, &buf, &start);
 470
 471   data = STRINGBUF_CONTENTS (buf);
 472   data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
 473
 474   return data;
 475 }
 476
 477 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
 478    STR.  */
 479 const char *
 480 scm_i_string_chars (SCM str)
 481 {
 482   SCM buf;
 483   size_t start;
 484   get_str_buf_start (&str, &buf, &start);
 485   if (scm_i_is_narrow_string (str))
 486     return (const char *) STRINGBUF_CHARS (buf) + start;
 487   else
 488     scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
 489                     scm_list_1 (str));
 490   return NULL;
 491 }
 492
 493 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
 494    STR.  */
 495 const scm_t_wchar *
 496 scm_i_string_wide_chars (SCM str)
 497 {
 498   SCM buf;
 499   size_t start;
 500
 501   get_str_buf_start (&str, &buf, &start);
 502   if (!scm_i_is_narrow_string (str))
 503     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
 504   else
 505     scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
 506                     scm_list_1 (str));
 507 }
 508
 509 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
 510    a new string buffer, so that it can be modified without modifying
 511    other strings.  Also, lock the string mutex.  Later, one must call
 512    scm_i_string_stop_writing to unlock the mutex.  */
 513 SCM
 514 scm_i_string_start_writing (SCM orig_str)
 515 {
 516   SCM buf, str = orig_str;
 517   size_t start;
 518
 519   get_str_buf_start (&str, &buf, &start);
 520   if (IS_RO_STRING (str))
 521     scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
 522
 523   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 524   if (STRINGBUF_SHARED (buf))
 525     {
 526       /* Clone the stringbuf.  */
 527       size_t len = STRING_LENGTH (str);
 528       SCM new_buf;
 529
 530       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 531
 532       if (scm_i_is_narrow_string (str))
 533         {
 534           new_buf = make_stringbuf (len);
 535           memcpy (STRINGBUF_CHARS (new_buf),
 536                   STRINGBUF_CHARS (buf) + STRING_START (str), len);
 537
 538         }
 539       else
 540         {
 541           new_buf = make_wide_stringbuf (len);
 542           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 543                    (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
 544                                      + STRING_START (str)), len);
 545         }
 546
 547       SET_STRING_STRINGBUF (str, new_buf);
 548       start -= STRING_START (str);
 549
 550       /* FIXME: The following operations are not atomic, so other threads
 551          looking at STR may see an inconsistent state.  Nevertheless it can't
 552          hurt much since (i) accessing STR while it is being mutated can't
 553          yield a crash, and (ii) concurrent accesses to STR should be
 554          protected by a mutex at the application level.  The latter may not
 555          apply when STR != ORIG_STR, though.  */
 556       SET_STRING_START (str, 0);
 557       SET_STRING_STRINGBUF (str, new_buf);
 558
 559       buf = new_buf;
 560
 561       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 562     }
 563   return orig_str;
 564 }
 565
 566 /* Return a pointer to the 8-bit Latin-1 chars of a string.  */
 567 char *
 568 scm_i_string_writable_chars (SCM str)
 569 {
 570   SCM buf;
 571   size_t start;
 572
 573   get_str_buf_start (&str, &buf, &start);
 574   if (scm_i_is_narrow_string (str))
 575     return (char *) STRINGBUF_CHARS (buf) + start;
 576   else
 577     scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
 578                     scm_list_1 (str));
 579   return NULL;
 580 }
 581
 582 /* Return a pointer to the UCS-4 codepoints of a string.  */
 583 static scm_t_wchar *
 584 scm_i_string_writable_wide_chars (SCM str)
 585 {
 586   SCM buf;
 587   size_t start;
 588
 589   get_str_buf_start (&str, &buf, &start);
 590   if (!scm_i_is_narrow_string (str))
 591     return STRINGBUF_WIDE_CHARS (buf) + start;
 592   else
 593     scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
 594                     scm_list_1 (str));
 595 }
 596
 597 /* Unlock the string mutex that was locked when
 598    scm_i_string_start_writing was called.  */
 599 void
 600 scm_i_string_stop_writing (void)
 601 {
 602   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 603 }
 604
 605 /* Return the Xth character of STR as a UCS-4 codepoint.  */
 606 scm_t_wchar
 607 scm_i_string_ref (SCM str, size_t x)
 608 {
 609   if (scm_i_is_narrow_string (str))
 610     return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
 611   else
 612     return scm_i_string_wide_chars (str)[x];
 613 }
 614
 615 /* Returns index+1 of the first char in STR that matches C, or
 616    0 if the char is not found.  */
 617 int
 618 scm_i_string_contains_char (SCM str, char ch)
 619 {
 620   size_t i;
 621   size_t len = scm_i_string_length (str);
 622
 623   i = 0;
 624   if (scm_i_is_narrow_string (str))
 625     {
 626       while (i < len)
 627         {
 628           if (scm_i_string_chars (str)[i] == ch)
 629             return i+1;
 630           i++;
 631         }
 632     }
 633   else
 634     {
 635       while (i < len)
 636         {
 637           if (scm_i_string_wide_chars (str)[i]
 638               == (unsigned char) ch)
 639             return i+1;
 640           i++;
 641         }
 642     }
 643   return 0;
 644 }
 645
 646 int
 647 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
 648 {
 649   if (scm_i_is_narrow_string (sstr))
 650     {
 651       const char *a = scm_i_string_chars (sstr) + start_x;
 652       const char *b = cstr;
 653       return strncmp (a, b, strlen(b));
 654     }
 655   else
 656     {
 657       size_t i;
 658       const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
 659       const char *b = cstr;
 660       for (i = 0; i < strlen (b); i++)
 661         {
 662           if (a[i] != (unsigned char) b[i])
 663             return 1;
 664         }
 665     }
 666   return 0;
 667 }
 668
 669 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
 670 void
 671 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
 672 {
 673   if (IS_SH_STRING (str))
 674     {
 675       p += STRING_START (str);
 676       str = SH_STRING_STRING (str);
 677     }
 678
 679   if (chr > 0xFF && scm_i_is_narrow_string (str))
 680     SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
 681
 682   if (scm_i_is_narrow_string (str))
 683     {
 684       char *dst = scm_i_string_writable_chars (str);
 685       dst[p] = chr;
 686     }
 687   else
 688     {
 689       scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
 690       dst[p] = chr;
 691     }
 692 }
 693
 694 \f
 695 /* Symbols.
 696
 697    Basic symbol creation and accessing is done here, the rest is in
 698    symbols.[hc].  This has been done to keep stringbufs and the
 699    internals of strings and string-like objects confined to this file.
 700 */
 701
 702 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
 703
 704 SCM
 705 scm_i_make_symbol (SCM name, scm_t_bits flags,
 706                    unsigned long hash, SCM props)
 707 {
 708   SCM buf;
 709   size_t start = STRING_START (name);
 710   size_t length = STRING_LENGTH (name);
 711
 712   if (IS_SH_STRING (name))
 713     {
 714       name = SH_STRING_STRING (name);
 715       start += STRING_START (name);
 716     }
 717   buf = SYMBOL_STRINGBUF (name);
 718
 719   if (start == 0 && length == STRINGBUF_LENGTH (buf))
 720     {
 721       /* reuse buf. */
 722       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 723       SET_STRINGBUF_SHARED (buf);
 724       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 725     }
 726   else
 727     {
 728       /* make new buf. */
 729       if (scm_i_is_narrow_string (name))
 730         {
 731           SCM new_buf = make_stringbuf (length);
 732           memcpy (STRINGBUF_CHARS (new_buf),
 733                   STRINGBUF_CHARS (buf) + start, length);
 734           buf = new_buf;
 735         }
 736       else
 737         {
 738           SCM new_buf = make_wide_stringbuf (length);
 739           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 740                    (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
 741                    length);
 742           buf = new_buf;
 743         }
 744     }
 745   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 746                           (scm_t_bits) hash, SCM_UNPACK (props));
 747 }
 748
 749 SCM
 750 scm_i_c_make_symbol (const char *name, size_t len,
 751                      scm_t_bits flags, unsigned long hash, SCM props)
 752 {
 753   SCM buf = make_stringbuf (len);
 754   memcpy (STRINGBUF_CHARS (buf), name, len);
 755
 756   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 757                           (scm_t_bits) hash, SCM_UNPACK (props));
 758 }
 759
 760 /* Returns the number of characters in SYM.  This may be different
 761    from the memory size of SYM.  */
 762 size_t
 763 scm_i_symbol_length (SCM sym)
 764 {
 765   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 766 }
 767
 768 size_t
 769 scm_c_symbol_length (SCM sym)
 770 #define FUNC_NAME "scm_c_symbol_length"
 771 {
 772   SCM_VALIDATE_SYMBOL (1, sym);
 773
 774   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 775 }
 776 #undef FUNC_NAME
 777
 778 /* True if the name of SYM is stored as a Latin-1 encoded string.
 779    False if it is stored as a 32-bit UCS-4-encoded string.  */
 780 int
 781 scm_i_is_narrow_symbol (SCM sym)
 782 {
 783   SCM buf;
 784
 785   buf = SYMBOL_STRINGBUF (sym);
 786   return !STRINGBUF_WIDE (buf);
 787 }
 788
 789 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
 790    contains the name of SYM.  */
 791 const char *
 792 scm_i_symbol_chars (SCM sym)
 793 {
 794   SCM buf;
 795
 796   buf = SYMBOL_STRINGBUF (sym);
 797   if (!STRINGBUF_WIDE (buf))
 798     return (const char *) STRINGBUF_CHARS (buf);
 799   else
 800     scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
 801                     scm_list_1 (sym));
 802 }
 803
 804 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
 805    symbol's name.  */
 806 const scm_t_wchar *
 807 scm_i_symbol_wide_chars (SCM sym)
 808 {
 809   SCM buf;
 810
 811   buf = SYMBOL_STRINGBUF (sym);
 812   if (STRINGBUF_WIDE (buf))
 813     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
 814   else
 815     scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
 816                     scm_list_1 (sym));
 817 }
 818
 819 SCM
 820 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
 821 {
 822   SCM buf = SYMBOL_STRINGBUF (sym);
 823   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 824   SET_STRINGBUF_SHARED (buf);
 825   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 826   return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
 827                           (scm_t_bits)start, (scm_t_bits) end - start);
 828 }
 829
 830 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint.  */
 831 scm_t_wchar
 832 scm_i_symbol_ref (SCM sym, size_t x)
 833 {
 834   if (scm_i_is_narrow_symbol (sym))
 835     return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
 836   else
 837     return scm_i_symbol_wide_chars (sym)[x];
 838 }
 839
 840 /* Debugging
 841  */
 842
 843 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
 844             "Returns an association list containing debugging information\n"
 845             "for @var{str}. The association list has the following entries."
 846             "@table @code\n"
 847             "@item string\n"
 848             "The string itself.\n"
 849             "@item start\n"
 850             "The start index of the string into its stringbuf\n"
 851             "@item length\n"
 852             "The length of the string\n"
 853             "@item shared\n"
 854             "If this string is a substring, it returns its parent string.\n"
 855             "Otherwise, it returns @code{#f}\n"
 856             "@item read-only\n"
 857             "@code{#t} if the string is read-only\n"
 858             "@item stringbuf-chars\n"
 859             "A new string containing this string's stringbuf's characters\n"
 860             "@item stringbuf-length\n"
 861             "The number of characters in this stringbuf\n"
 862             "@item stringbuf-shared\n"
 863             "@code{#t} if this stringbuf is shared\n"
 864             "@item stringbuf-wide\n"
 865             "@code{#t} if this stringbuf's characters are stored in a\n"
 866             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
 867             "buffer\n"
 868             "@end table")
 869 #define FUNC_NAME s_scm_sys_string_dump
 870 {
 871   SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
 872   SCM buf;
 873   SCM_VALIDATE_STRING (1, str);
 874
 875   /* String info */
 876   e1 = scm_cons (scm_from_latin1_symbol ("string"),
 877                  str);
 878   e2 = scm_cons (scm_from_latin1_symbol ("start"),
 879                  scm_from_size_t (STRING_START (str)));
 880   e3 = scm_cons (scm_from_latin1_symbol ("length"),
 881                  scm_from_size_t (STRING_LENGTH (str)));
 882
 883   if (IS_SH_STRING (str))
 884     {
 885       e4 = scm_cons (scm_from_latin1_symbol ("shared"),
 886                      SH_STRING_STRING (str));
 887       buf = STRING_STRINGBUF (SH_STRING_STRING (str));
 888     }
 889   else
 890     {
 891       e4 = scm_cons (scm_from_latin1_symbol ("shared"),
 892                      SCM_BOOL_F);
 893       buf = STRING_STRINGBUF (str);
 894     }
 895
 896   if (IS_RO_STRING (str))
 897     e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
 898                    SCM_BOOL_T);
 899   else
 900     e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
 901                    SCM_BOOL_F);
 902
 903   /* Stringbuf info */
 904   if (!STRINGBUF_WIDE (buf))
 905     {
 906       size_t len = STRINGBUF_LENGTH (buf);
 907       char *cbuf;
 908       SCM sbc = scm_i_make_string (len, &cbuf, 0);
 909       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
 910       e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 911                      sbc);
 912     }
 913   else
 914     {
 915       size_t len = STRINGBUF_LENGTH (buf);
 916       scm_t_wchar *cbuf;
 917       SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
 918       u32_cpy ((scm_t_uint32 *) cbuf,
 919                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
 920       e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 921                      sbc);
 922     }
 923   e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
 924                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
 925   if (STRINGBUF_SHARED (buf))
 926     e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 927                    SCM_BOOL_T);
 928   else
 929     e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 930                    SCM_BOOL_F);
 931   if (STRINGBUF_WIDE (buf))
 932     e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 933                    SCM_BOOL_T);
 934   else
 935     e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 936                    SCM_BOOL_F);
 937
 938   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
 939 }
 940 #undef FUNC_NAME
 941
 942 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
 943             "Returns an association list containing debugging information\n"
 944             "for @var{sym}. The association list has the following entries."
 945             "@table @code\n"
 946             "@item symbol\n"
 947             "The symbol itself\n"
 948             "@item hash\n"
 949             "Its hash value\n"
 950             "@item interned\n"
 951             "@code{#t} if it is an interned symbol\n"
 952             "@item stringbuf-chars\n"
 953             "A new string containing this symbols's stringbuf's characters\n"
 954             "@item stringbuf-length\n"
 955             "The number of characters in this stringbuf\n"
 956             "@item stringbuf-shared\n"
 957             "@code{#t} if this stringbuf is shared\n"
 958             "@item stringbuf-wide\n"
 959             "@code{#t} if this stringbuf's characters are stored in a\n"
 960             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
 961             "buffer\n"
 962             "@end table")
 963 #define FUNC_NAME s_scm_sys_symbol_dump
 964 {
 965   SCM e1, e2, e3, e4, e5, e6, e7;
 966   SCM buf;
 967   SCM_VALIDATE_SYMBOL (1, sym);
 968   e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
 969                  sym);
 970   e2 = scm_cons (scm_from_latin1_symbol ("hash"),
 971                  scm_from_ulong (scm_i_symbol_hash (sym)));
 972   e3 = scm_cons (scm_from_latin1_symbol ("interned"),
 973                  scm_symbol_interned_p (sym));
 974   buf = SYMBOL_STRINGBUF (sym);
 975
 976   /* Stringbuf info */
 977   if (!STRINGBUF_WIDE (buf))
 978     {
 979       size_t len = STRINGBUF_LENGTH (buf);
 980       char *cbuf;
 981       SCM sbc = scm_i_make_string (len, &cbuf, 0);
 982       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
 983       e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 984                      sbc);
 985     }
 986   else
 987     {
 988       size_t len = STRINGBUF_LENGTH (buf);
 989       scm_t_wchar *cbuf;
 990       SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
 991       u32_cpy ((scm_t_uint32 *) cbuf,
 992                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
 993       e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 994                      sbc);
 995     }
 996   e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
 997                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
 998   if (STRINGBUF_SHARED (buf))
 999     e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1000                    SCM_BOOL_T);
1001   else
1002     e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1003                    SCM_BOOL_F);
1004   if (STRINGBUF_WIDE (buf))
1005     e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1006                     SCM_BOOL_T);
1007   else
1008     e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1009                     SCM_BOOL_F);
1010   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
1011
1012 }
1013 #undef FUNC_NAME
1014
1015 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1016
1017 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
1018 #define FUNC_NAME s_scm_sys_stringbuf_hist
1019 {
1020   int i;
1021   for (i = 0; i < 1000; i++)
1022     if (lenhist[i])
1023       fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1024   fprintf (stderr, ">999: %u\n", lenhist[1000]);
1025   return SCM_UNSPECIFIED;
1026 }
1027 #undef FUNC_NAME
1028
1029 #endif
1030
1031 \f
1032
1033 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1034             (SCM obj),
1035             "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1036 #define FUNC_NAME s_scm_string_p
1037 {
1038   return scm_from_bool (IS_STRING (obj));
1039 }
1040 #undef FUNC_NAME
1041
1042
1043 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1044
1045 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1046             (SCM chrs),
1047             "@deffnx {Scheme Procedure} list->string chrs\n"
1048             "Return a newly allocated string composed of the arguments,\n"
1049             "@var{chrs}.")
1050 #define FUNC_NAME s_scm_string
1051 {
1052   SCM result = SCM_BOOL_F;
1053   SCM rest;
1054   size_t len;
1055   size_t p = 0;
1056   long i;
1057   int wide = 0;
1058
1059   /* Verify that this is a list of chars.  */
1060   i = scm_ilength (chrs);
1061   SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1062
1063   len = (size_t) i;
1064   rest = chrs;
1065
1066   while (len > 0 && scm_is_pair (rest))
1067     {
1068       SCM elt = SCM_CAR (rest);
1069       SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1070       if (SCM_CHAR (elt) > 0xFF)
1071         wide = 1;
1072       rest = SCM_CDR (rest);
1073       len--;
1074       scm_remember_upto_here_1 (elt);
1075     }
1076
1077   /* Construct a string containing this list of chars.  */
1078   len = (size_t) i;
1079   rest = chrs;
1080
1081   if (wide == 0)
1082     {
1083       char *buf;
1084
1085       result = scm_i_make_string (len, NULL, 0);
1086       result = scm_i_string_start_writing (result);
1087       buf = scm_i_string_writable_chars (result);
1088       while (len > 0 && scm_is_pair (rest))
1089         {
1090           SCM elt = SCM_CAR (rest);
1091           buf[p] = (unsigned char) SCM_CHAR (elt);
1092           p++;
1093           rest = SCM_CDR (rest);
1094           len--;
1095           scm_remember_upto_here_1 (elt);
1096         }
1097     }
1098   else
1099     {
1100       scm_t_wchar *buf;
1101
1102       result = scm_i_make_wide_string (len, NULL, 0);
1103       result = scm_i_string_start_writing (result);
1104       buf = scm_i_string_writable_wide_chars (result);
1105       while (len > 0 && scm_is_pair (rest))
1106         {
1107           SCM elt = SCM_CAR (rest);
1108           buf[p] = SCM_CHAR (elt);
1109           p++;
1110           rest = SCM_CDR (rest);
1111           len--;
1112           scm_remember_upto_here_1 (elt);
1113         }
1114     }
1115   scm_i_string_stop_writing ();
1116
1117   if (len > 0)
1118     scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1119   if (!scm_is_null (rest))
1120     scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1121
1122   return result;
1123 }
1124 #undef FUNC_NAME
1125
1126 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1127             (SCM k, SCM chr),
1128             "Return a newly allocated string of\n"
1129             "length @var{k}.  If @var{chr} is given, then all elements of\n"
1130             "the string are initialized to @var{chr}, otherwise the contents\n"
1131             "of the @var{string} are all set to @var{#\nul}.")
1132 #define FUNC_NAME s_scm_make_string
1133 {
1134   return scm_c_make_string (scm_to_size_t (k), chr);
1135 }
1136 #undef FUNC_NAME
1137
1138 SCM
1139 scm_c_make_string (size_t len, SCM chr)
1140 #define FUNC_NAME NULL
1141 {
1142   size_t p;
1143   char *contents = NULL;
1144   SCM res = scm_i_make_string (len, &contents, 0);
1145
1146   /* If no char is given, initialize string contents to NULL.  */
1147   if (SCM_UNBNDP (chr))
1148     memset (contents, 0, len);
1149   else
1150     {
1151       SCM_VALIDATE_CHAR (0, chr);
1152       res = scm_i_string_start_writing (res);
1153       for (p = 0; p < len; p++)
1154         scm_i_string_set_x (res, p, SCM_CHAR (chr));
1155       scm_i_string_stop_writing ();
1156     }
1157
1158   return res;
1159 }
1160 #undef FUNC_NAME
1161
1162 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1163             (SCM string),
1164             "Return the number of characters in @var{string}.")
1165 #define FUNC_NAME s_scm_string_length
1166 {
1167   SCM_VALIDATE_STRING (1, string);
1168   return scm_from_size_t (STRING_LENGTH (string));
1169 }
1170 #undef FUNC_NAME
1171
1172 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1173             (SCM string),
1174             "Return the bytes used to represent a character in @var{string}."
1175             "This will return 1 or 4.")
1176 #define FUNC_NAME s_scm_string_bytes_per_char
1177 {
1178   SCM_VALIDATE_STRING (1, string);
1179   if (!scm_i_is_narrow_string (string))
1180     return scm_from_int (4);
1181
1182   return scm_from_int (1);
1183 }
1184 #undef FUNC_NAME
1185
1186 size_t
1187 scm_c_string_length (SCM string)
1188 {
1189   if (!IS_STRING (string))
1190     scm_wrong_type_arg_msg (NULL, 0, string, "string");
1191   return STRING_LENGTH (string);
1192 }
1193
1194 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1195             (SCM str, SCM k),
1196             "Return character @var{k} of @var{str} using zero-origin\n"
1197             "indexing. @var{k} must be a valid index of @var{str}.")
1198 #define FUNC_NAME s_scm_string_ref
1199 {
1200   size_t len;
1201   unsigned long idx;
1202
1203   SCM_VALIDATE_STRING (1, str);
1204
1205   len = scm_i_string_length (str);
1206   if (SCM_LIKELY (len > 0))
1207     idx = scm_to_unsigned_integer (k, 0, len - 1);
1208   else
1209     scm_out_of_range (NULL, k);
1210
1211   if (scm_i_is_narrow_string (str))
1212     return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1213   else
1214     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1215 }
1216 #undef FUNC_NAME
1217
1218 SCM
1219 scm_c_string_ref (SCM str, size_t p)
1220 {
1221   if (p >= scm_i_string_length (str))
1222     scm_out_of_range (NULL, scm_from_size_t (p));
1223   if (scm_i_is_narrow_string (str))
1224     return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1225   else
1226     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1227
1228 }
1229
1230 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1231             (SCM str, SCM k, SCM chr),
1232             "Store @var{chr} in element @var{k} of @var{str} and return\n"
1233             "an unspecified value. @var{k} must be a valid index of\n"
1234             "@var{str}.")
1235 #define FUNC_NAME s_scm_string_set_x
1236 {
1237   size_t len;
1238   unsigned long idx;
1239
1240   SCM_VALIDATE_STRING (1, str);
1241
1242   len = scm_i_string_length (str);
1243   if (SCM_LIKELY (len > 0))
1244     idx = scm_to_unsigned_integer (k, 0, len - 1);
1245   else
1246     scm_out_of_range (NULL, k);
1247
1248   SCM_VALIDATE_CHAR (3, chr);
1249   str = scm_i_string_start_writing (str);
1250   scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1251   scm_i_string_stop_writing ();
1252
1253   return SCM_UNSPECIFIED;
1254 }
1255 #undef FUNC_NAME
1256
1257 void
1258 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1259 {
1260   if (p >= scm_i_string_length (str))
1261     scm_out_of_range (NULL, scm_from_size_t (p));
1262   str = scm_i_string_start_writing (str);
1263   scm_i_string_set_x (str, p, SCM_CHAR (chr));
1264   scm_i_string_stop_writing ();
1265 }
1266
1267 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1268             (SCM str, SCM start, SCM end),
1269             "Return a newly allocated string formed from the characters\n"
1270             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1271             "ending with index @var{end} (exclusive).\n"
1272             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1273             "exact integers satisfying:\n\n"
1274             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1275 #define FUNC_NAME s_scm_substring
1276 {
1277   size_t len, from, to;
1278
1279   SCM_VALIDATE_STRING (1, str);
1280   len = scm_i_string_length (str);
1281   from = scm_to_unsigned_integer (start, 0, len);
1282   if (SCM_UNBNDP (end))
1283     to = len;
1284   else
1285     to = scm_to_unsigned_integer (end, from, len);
1286   return scm_i_substring (str, from, to);
1287 }
1288 #undef FUNC_NAME
1289
1290 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1291             (SCM str, SCM start, SCM end),
1292             "Return a newly allocated string formed from the characters\n"
1293             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1294             "ending with index @var{end} (exclusive).\n"
1295             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1296             "exact integers satisfying:\n"
1297             "\n"
1298             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1299             "\n"
1300             "The returned string is read-only.\n")
1301 #define FUNC_NAME s_scm_substring_read_only
1302 {
1303   size_t len, from, to;
1304
1305   SCM_VALIDATE_STRING (1, str);
1306   len = scm_i_string_length (str);
1307   from = scm_to_unsigned_integer (start, 0, len);
1308   if (SCM_UNBNDP (end))
1309     to = len;
1310   else
1311     to = scm_to_unsigned_integer (end, from, len);
1312   return scm_i_substring_read_only (str, from, to);
1313 }
1314 #undef FUNC_NAME
1315
1316 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1317             (SCM str, SCM start, SCM end),
1318             "Return a newly allocated string formed from the characters\n"
1319             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1320             "ending with index @var{end} (exclusive).\n"
1321             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1322             "exact integers satisfying:\n\n"
1323             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1324 #define FUNC_NAME s_scm_substring_copy
1325 {
1326   /* For the Scheme version, START is mandatory, but for the C
1327      version, it is optional.  See scm_string_copy in srfi-13.c for a
1328      rationale.
1329   */
1330
1331   size_t from, to;
1332
1333   SCM_VALIDATE_STRING (1, str);
1334   scm_i_get_substring_spec (scm_i_string_length (str),
1335                             start, &from, end, &to);
1336   return scm_i_substring_copy (str, from, to);
1337 }
1338 #undef FUNC_NAME
1339
1340 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1341             (SCM str, SCM start, SCM end),
1342             "Return string that indirectly refers to the characters\n"
1343             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1344             "ending with index @var{end} (exclusive).\n"
1345             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1346             "exact integers satisfying:\n\n"
1347             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1348 #define FUNC_NAME s_scm_substring_shared
1349 {
1350   size_t len, from, to;
1351
1352   SCM_VALIDATE_STRING (1, str);
1353   len = scm_i_string_length (str);
1354   from = scm_to_unsigned_integer (start, 0, len);
1355   if (SCM_UNBNDP (end))
1356     to = len;
1357   else
1358     to = scm_to_unsigned_integer (end, from, len);
1359   return scm_i_substring_shared (str, from, to);
1360 }
1361 #undef FUNC_NAME
1362
1363 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1364             (SCM args),
1365             "Return a newly allocated string whose characters form the\n"
1366             "concatenation of the given strings, @var{args}.")
1367 #define FUNC_NAME s_scm_string_append
1368 {
1369   SCM res;
1370   size_t len = 0;
1371   int wide = 0;
1372   SCM l, s;
1373   size_t i;
1374   union
1375   {
1376     char *narrow;
1377     scm_t_wchar *wide;
1378   } data;
1379
1380   SCM_VALIDATE_REST_ARGUMENT (args);
1381   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1382     {
1383       s = SCM_CAR (l);
1384       SCM_VALIDATE_STRING (SCM_ARGn, s);
1385       len += scm_i_string_length (s);
1386       if (!scm_i_is_narrow_string (s))
1387         wide = 1;
1388     }
1389   data.narrow = NULL;
1390   if (!wide)
1391     res = scm_i_make_string (len, &data.narrow, 0);
1392   else
1393     res = scm_i_make_wide_string (len, &data.wide, 0);
1394
1395   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1396     {
1397       size_t len;
1398       s = SCM_CAR (l);
1399       SCM_VALIDATE_STRING (SCM_ARGn, s);
1400       len = scm_i_string_length (s);
1401       if (!wide)
1402         {
1403           memcpy (data.narrow, scm_i_string_chars (s), len);
1404           data.narrow += len;
1405         }
1406       else
1407         {
1408           if (scm_i_is_narrow_string (s))
1409             {
1410               for (i = 0; i < scm_i_string_length (s); i++)
1411                 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
1412             }
1413           else
1414             u32_cpy ((scm_t_uint32 *) data.wide,
1415                      (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1416           data.wide += len;
1417         }
1418       scm_remember_upto_here_1 (s);
1419     }
1420   return res;
1421 }
1422 #undef FUNC_NAME
1423
1424
1425 \f
1426 /* Charset conversion error handling.  */
1427
1428 SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1429 SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1430
1431 /* Raise an exception informing that character CHR could not be written
1432    to PORT in its current encoding.  */
1433 void
1434 scm_encoding_error (const char *subr, int err, const char *message,
1435                     SCM port, SCM chr)
1436 {
1437   scm_throw (scm_encoding_error_key,
1438              scm_list_n (scm_from_latin1_string (subr),
1439                          scm_from_latin1_string (message),
1440                          scm_from_int (err),
1441                          port, chr,
1442                          SCM_UNDEFINED));
1443 }
1444
1445 /* Raise an exception informing of an encoding error on PORT.  This
1446    means that a character could not be written in PORT's encoding.  */
1447 void
1448 scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1449 {
1450   scm_throw (scm_decoding_error_key,
1451              scm_list_n (scm_from_latin1_string (subr),
1452                          scm_from_latin1_string (message),
1453                          scm_from_int (err),
1454                          port,
1455                          SCM_UNDEFINED));
1456 }
1457
1458 \f
1459 /* String conversion to/from C.  */
1460
1461 static void
1462 decoding_error (const char *func_name, int errno_save,
1463                 const char *str, size_t len)
1464 {
1465   /* Raise an error and pass the raw C string as a bytevector to the `throw'
1466      handler.  */
1467   SCM bv;
1468   signed char *buf;
1469
1470   buf = scm_gc_malloc_pointerless (len, "bytevector");
1471   memcpy (buf, str, len);
1472   bv = scm_c_take_gc_bytevector (buf, len, SCM_BOOL_F);
1473
1474   scm_decoding_error (func_name, errno_save,
1475                       "input locale conversion error", bv);
1476 }
1477
1478 SCM
1479 scm_from_stringn (const char *str, size_t len, const char *encoding,
1480                   scm_t_string_failed_conversion_handler handler)
1481 {
1482   size_t u32len, i;
1483   scm_t_wchar *u32;
1484   int wide = 0;
1485   SCM res;
1486
1487   /* The order of these checks is important. */
1488   if (!str && len != 0)
1489     scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1490   if (len == (size_t) -1)
1491     len = strlen (str);
1492   if (len == 0)
1493     return scm_nullstr;
1494
1495   if (encoding == NULL)
1496     {
1497       /* If encoding is null, use Latin-1.  */
1498       char *buf;
1499       res = scm_i_make_string (len, &buf, 0);
1500       memcpy (buf, str, len);
1501       return res;
1502     }
1503
1504   u32len = 0;
1505   u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1506                                                 (enum iconv_ilseq_handler)
1507                                                 handler,
1508                                                 str, len,
1509                                                 NULL,
1510                                                 NULL, &u32len);
1511
1512   if (SCM_UNLIKELY (u32 == NULL))
1513     decoding_error (__func__, errno, str, len);
1514
1515   i = 0;
1516   while (i < u32len)
1517     if (u32[i++] > 0xFF)
1518       {
1519         wide = 1;
1520         break;
1521       }
1522
1523   if (!wide)
1524     {
1525       char *dst;
1526       res = scm_i_make_string (u32len, &dst, 0);
1527       for (i = 0; i < u32len; i ++)
1528         dst[i] = (unsigned char) u32[i];
1529       dst[u32len] = '\0';
1530     }
1531   else
1532     {
1533       scm_t_wchar *wdst;
1534       res = scm_i_make_wide_string (u32len, &wdst, 0);
1535       u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1536       wdst[u32len] = 0;
1537     }
1538
1539   free (u32);
1540   return res;
1541 }
1542
1543 SCM
1544 scm_from_locale_string (const char *str)
1545 {
1546   return scm_from_locale_stringn (str, -1);
1547 }
1548
1549 SCM
1550 scm_from_locale_stringn (const char *str, size_t len)
1551 {
1552   return scm_from_stringn (str, len, locale_charset (),
1553                            scm_i_get_conversion_strategy (SCM_BOOL_F));
1554 }
1555
1556 SCM
1557 scm_from_latin1_string (const char *str)
1558 {
1559   return scm_from_latin1_stringn (str, -1);
1560 }
1561
1562 SCM
1563 scm_from_latin1_stringn (const char *str, size_t len)
1564 {
1565   char *buf;
1566   SCM result;
1567
1568   if (len == (size_t) -1)
1569     len = strlen (str);
1570
1571   /* Make a narrow string and copy STR as is.  */
1572   result = scm_i_make_string (len, &buf, 0);
1573   memcpy (buf, str, len);
1574
1575   return result;
1576 }
1577
1578 SCM
1579 scm_from_utf8_string (const char *str)
1580 {
1581   return scm_from_utf8_stringn (str, -1);
1582 }
1583
1584 SCM
1585 scm_from_utf8_stringn (const char *str, size_t len)
1586 {
1587   size_t i, char_len;
1588   const scm_t_uint8 *ustr = (const scm_t_uint8 *) str;
1589   int ascii = 1, narrow = 1;
1590   SCM res;
1591
1592   if (len == (size_t) -1)
1593     len = strlen (str);
1594
1595   i = 0;
1596   char_len = 0;
1597
1598   while (i < len)
1599     {
1600       if (ustr[i] <= 127)
1601         {
1602           char_len++;
1603           i++;
1604         }
1605       else
1606         {
1607           ucs4_t c;
1608           int nbytes;
1609
1610           ascii = 0;
1611
1612           nbytes = u8_mbtouc (&c, ustr + i, len - i);
1613
1614           if (nbytes < 0)
1615             /* Bad UTF-8.  */
1616             decoding_error (__func__, errno, str, len);
1617
1618           if (c > 255)
1619             narrow = 0;
1620
1621           char_len++;
1622           i += nbytes;
1623         }
1624     }
1625
1626   if (ascii)
1627     {
1628       char *dst;
1629       res = scm_i_make_string (char_len, &dst, 0);
1630       memcpy (dst, str, len);
1631     }
1632   else if (narrow)
1633     {
1634       char *dst;
1635       size_t j;
1636       ucs4_t c;
1637
1638       res = scm_i_make_string (char_len, &dst, 0);
1639
1640       for (i = 0, j = 0; i < len; i++, j++)
1641         {
1642           i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1643           dst[j] = (signed char) c;
1644         }
1645     }
1646   else
1647     {
1648       scm_t_wchar *dst;
1649       size_t j;
1650       ucs4_t c;
1651
1652       res = scm_i_make_wide_string (char_len, &dst, 0);
1653
1654       for (i = 0, j = 0; i < len; i++, j++)
1655         {
1656           i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1657           dst[j] = c;
1658         }
1659     }
1660
1661   return res;
1662 }
1663
1664 SCM
1665 scm_from_utf32_string (const scm_t_wchar *str)
1666 {
1667   return scm_from_utf32_stringn (str, -1);
1668 }
1669
1670 SCM
1671 scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1672 {
1673   SCM result;
1674   scm_t_wchar *buf;
1675
1676   if (len == (size_t) -1)
1677     len = u32_strlen ((uint32_t *) str);
1678
1679   result = scm_i_make_wide_string (len, &buf, 0);
1680   memcpy (buf, str, len * sizeof (scm_t_wchar));
1681   scm_i_try_narrow_string (result);
1682
1683   return result;
1684 }
1685
1686 /* Create a new scheme string from the C string STR.  The memory of
1687    STR may be used directly as storage for the new string.  */
1688 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1689    would be to register a finalizer to eventually free(3) STR, which isn't
1690    worth it.  Should we just deprecate the `scm_take_' functions?  */
1691 SCM
1692 scm_take_locale_stringn (char *str, size_t len)
1693 {
1694   SCM res;
1695
1696   res = scm_from_locale_stringn (str, len);
1697   free (str);
1698
1699   return res;
1700 }
1701
1702 SCM
1703 scm_take_locale_string (char *str)
1704 {
1705   return scm_take_locale_stringn (str, -1);
1706 }
1707
/* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
   *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
   Set *LENP to the size of the resulting string.

   Only escapes whose first two hex digits are `00' are rewritten:
   `\u00NN' becomes `\xNN' and `\U00NNNNNN' becomes `\UNNNNNN', with the
   remaining hex digits lower-cased.  Everything else is copied as is.
   The rewrite is done in place; this is safe because the output never
   gets ahead of the input.

   FIXME: This is a hack we should get rid of.  See
   <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
   for details.  */
static void
unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
{
  const size_t len = *lenp;
  size_t i = 0;         /* Read position.  */
  size_t j = 0;         /* Write position; always <= I.  */

  while (i < len)
    {
      /* Compare against the bytes left rather than `len - 6': the
         latter wraps around for short inputs and would let the escape
         be read past the end of the buffer.  */
      const size_t remaining = len - i;
      const int zero_prefixed = (remaining >= 4
                                 && buf[i] == '\\'
                                 && buf[i + 2] == '0'
                                 && buf[i + 3] == '0');

      if (zero_prefixed && remaining >= 6 && buf[i + 1] == 'u')
        {
          /* Convert \u00NN to \xNN */
          size_t k;

          buf[j] = '\\';
          buf[j + 1] = 'x';
          for (k = 0; k < 2; k++)
            buf[j + 2 + k] = tolower ((unsigned char) buf[i + 4 + k]);
          i += 6;
          j += 4;
        }
      else if (zero_prefixed && remaining >= 10 && buf[i + 1] == 'U')
        {
          /* Convert \U00NNNNNN to \UNNNNNN */
          size_t k;

          buf[j] = '\\';
          buf[j + 1] = 'U';
          for (k = 0; k < 6; k++)
            buf[j + 2 + k] = tolower ((unsigned char) buf[i + 4 + k]);
          i += 10;
          j += 8;
        }
      else
        {
          buf[j] = buf[i];
          i++;
          j++;
        }
    }
  *lenp = j;
}
1766
/* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
   *LENP-byte locale-encoded string, to `\xXXXX;'.  Set *LENP to the size
   of the resulting string.  BUF must be large enough to handle the
   worst case when `\uXXXX' escapes (6 characters) are replaced by
   `\xXXXX;' (7 characters).

   Leading zeros of the hex number are dropped and the remaining digits
   are lower-cased; an all-zero number is written as `\x0;'.  The result
   is built in a temporary buffer and copied back over BUF.  */
static void
unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
{
  const size_t len = *lenp;
  /* The worst case is if the input string contains all 4-digit hex escapes.
     "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
  const size_t max_out_len = (len * 7) / 6 + 1;
  char *out = alloca (max_out_len);
  size_t i = 0;         /* Read position in BUF.  */
  size_t j = 0;         /* Write position in OUT.  */

  while (i < len)
    {
      /* Compare against the bytes left rather than `len - 6': the
         latter wraps around for short inputs and would let the escape
         be read past the end of the buffer.  */
      const size_t remaining = len - i;
      size_t ndigits = 0;
      size_t nzeros, pos;

      if (remaining >= 2 && buf[i] == '\\')
        {
          if (buf[i + 1] == 'u' && remaining >= 6)
            ndigits = 4;
          else if (buf[i + 1] == 'U' && remaining >= 10)
            ndigits = 8;
        }

      if (ndigits == 0)
        {
          /* Not a complete escape: copy the byte through.  */
          out[j++] = buf[i++];
          continue;
        }

      /* Add the R6RS hex escape initial sequence and skip the `\u' or
         `\U' in the input.  */
      out[j++] = '\\';
      out[j++] = 'x';
      i += 2;

      /* Find the number of initial zeros in this hex number.  The bound
         is tested first so that no byte beyond the escape is read.  */
      nzeros = 0;
      while (nzeros < ndigits && buf[i + nzeros] == '0')
        nzeros++;

      /* Copy the number, skipping initial zeros.  */
      if (nzeros == ndigits)
        out[j++] = '0';
      else
        for (pos = nzeros; pos < ndigits; pos++)
          out[j++] = tolower ((unsigned char) buf[i + pos]);
      i += ndigits;

      /* Add terminating semicolon.  */
      out[j++] = ';';
    }

  *lenp = j;
  memcpy (buf, out, j);
}
1842
1843 char *
1844 scm_to_locale_string (SCM str)
1845 {
1846   return scm_to_locale_stringn (str, NULL);
1847 }
1848
1849 char *
1850 scm_to_locale_stringn (SCM str, size_t *lenp)
1851 {
1852   return scm_to_stringn (str, lenp,
1853                          locale_charset (),
1854                          scm_i_get_conversion_strategy (SCM_BOOL_F));
1855 }
1856
1857 char *
1858 scm_to_latin1_string (SCM str)
1859 {
1860   return scm_to_latin1_stringn (str, NULL);
1861 }
1862
1863 char *
1864 scm_to_latin1_stringn (SCM str, size_t *lenp)
1865 #define FUNC_NAME "scm_to_latin1_stringn"
1866 {
1867   char *result;
1868
1869   SCM_VALIDATE_STRING (1, str);
1870
1871   if (scm_i_is_narrow_string (str))
1872     {
1873       size_t len = scm_i_string_length (str);
1874
1875       if (lenp)
1876         *lenp = len;
1877
1878       result = scm_strndup (scm_i_string_data (str), len);
1879     }
1880   else
1881     result = scm_to_stringn (str, lenp, NULL,
1882                              SCM_FAILED_CONVERSION_ERROR);
1883
1884   return result;
1885 }
1886 #undef FUNC_NAME
1887
1888 char *
1889 scm_to_utf8_string (SCM str)
1890 {
1891   return scm_to_utf8_stringn (str, NULL);
1892 }
1893
1894 char *
1895 scm_to_utf8_stringn (SCM str, size_t *lenp)
1896 {
1897   return scm_to_stringn (str, lenp, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1898 }
1899
1900 scm_t_wchar *
1901 scm_to_utf32_string (SCM str)
1902 {
1903   return scm_to_utf32_stringn (str, NULL);
1904 }
1905
1906 scm_t_wchar *
1907 scm_to_utf32_stringn (SCM str, size_t *lenp)
1908 #define FUNC_NAME "scm_to_utf32_stringn"
1909 {
1910   scm_t_wchar *result;
1911
1912   SCM_VALIDATE_STRING (1, str);
1913
1914   if (scm_i_is_narrow_string (str))
1915     result = (scm_t_wchar *)
1916       scm_to_stringn (str, lenp, "UTF-32",
1917                       SCM_FAILED_CONVERSION_ERROR);
1918   else
1919     {
1920       size_t len;
1921
1922       len = scm_i_string_length (str);
1923       if (lenp)
1924         *lenp = len;
1925
1926       result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
1927       memcpy (result, scm_i_string_wide_chars (str),
1928               len * sizeof (scm_t_wchar));
1929       result[len] = 0;
1930     }
1931
1932   return result;
1933 }
1934 #undef FUNC_NAME
1935
1936 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
1937    according to ENCODING.  If LENP is non-NULL, set it to the size in bytes of
1938    the returned buffer.  If the conversion to ENCODING fails, apply the strategy
1939    defined by HANDLER.  */
1940 char *
1941 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
1942                 scm_t_string_failed_conversion_handler handler)
1943 {
1944   char *buf;
1945   size_t ilen, len, i;
1946   int ret;
1947   const char *enc;
1948
1949   if (!scm_is_string (str))
1950     scm_wrong_type_arg_msg (NULL, 0, str, "string");
1951   ilen = scm_i_string_length (str);
1952
1953   if (ilen == 0)
1954     {
1955       buf = scm_malloc (1);
1956       buf[0] = '\0';
1957       if (lenp)
1958         *lenp = 0;
1959       return buf;
1960     }
1961
1962   if (lenp == NULL)
1963     for (i = 0; i < ilen; i++)
1964       if (scm_i_string_ref (str, i) == '\0')
1965         scm_misc_error (NULL,
1966                         "string contains #\\nul character: ~S",
1967                         scm_list_1 (str));
1968
1969   if (scm_i_is_narrow_string (str) && (encoding == NULL))
1970     {
1971       /* If using native Latin-1 encoding, just copy the string
1972          contents.  */
1973       if (lenp)
1974         {
1975           buf = scm_malloc (ilen);
1976           memcpy (buf, scm_i_string_chars (str), ilen);
1977           *lenp = ilen;
1978           return buf;
1979         }
1980       else
1981         {
1982           buf = scm_malloc (ilen + 1);
1983           memcpy (buf, scm_i_string_chars (str), ilen);
1984           buf[ilen] = '\0';
1985           return buf;
1986         }
1987     }
1988
1989
1990   buf = NULL;
1991   len = 0;
1992   enc = encoding;
1993   if (enc == NULL)
1994     enc = "ISO-8859-1";
1995   if (scm_i_is_narrow_string (str))
1996     {
1997       ret = mem_iconveh (scm_i_string_chars (str), ilen,
1998                          "ISO-8859-1", enc,
1999                          (enum iconv_ilseq_handler) handler, NULL,
2000                          &buf, &len);
2001
2002       if (ret != 0)
2003         scm_encoding_error (__func__, errno,
2004                             "cannot convert narrow string to output locale",
2005                             SCM_BOOL_F,
2006                             /* FIXME: Faulty character unknown.  */
2007                             SCM_BOOL_F);
2008     }
2009   else
2010     {
2011       buf = u32_conv_to_encoding (enc,
2012                                   (enum iconv_ilseq_handler) handler,
2013                                   (scm_t_uint32 *) scm_i_string_wide_chars (str),
2014                                   ilen,
2015                                   NULL,
2016                                   NULL, &len);
2017       if (buf == NULL)
2018         scm_encoding_error (__func__, errno,
2019                             "cannot convert wide string to output locale",
2020                             SCM_BOOL_F,
2021                             /* FIXME: Faulty character unknown.  */
2022                             SCM_BOOL_F);
2023     }
2024   if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
2025     {
2026       if (SCM_R6RS_ESCAPES_P)
2027         {
2028           /* The worst case is if the input string contains all 4-digit
2029              hex escapes.  "\uXXXX" (six characters) becomes "\xXXXX;"
2030              (seven characters).  Make BUF large enough to hold
2031              that.  */
2032           buf = scm_realloc (buf, (len * 7) / 6 + 1);
2033           unistring_escapes_to_r6rs_escapes (buf, &len);
2034         }
2035       else
2036         unistring_escapes_to_guile_escapes (buf, &len);
2037
2038       buf = scm_realloc (buf, len);
2039     }
2040   if (lenp)
2041     *lenp = len;
2042   else
2043     {
2044       buf = scm_realloc (buf, len + 1);
2045       buf[len] = '\0';
2046     }
2047
2048   scm_remember_upto_here_1 (str);
2049   return buf;
2050 }
2051
2052 size_t
2053 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
2054 {
2055   size_t len;
2056   char *result = NULL;
2057   if (!scm_is_string (str))
2058     scm_wrong_type_arg_msg (NULL, 0, str, "string");
2059   result = scm_to_locale_stringn (str, &len);
2060
2061   memcpy (buf, result, (len > max_len) ? max_len : len);
2062   free (result);
2063
2064   scm_remember_upto_here_1 (str);
2065   return len;
2066 }
2067
2068 \f
2069 /* Unicode string normalization.  */
2070
2071 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2072    libguile/i18n.c.  It would be useful to have this factored out into a more
2073    convenient location, but its use of alloca makes that tricky to do. */
2074
2075 static SCM
2076 normalize_str (SCM string, uninorm_t form)
2077 {
2078   SCM ret;
2079   scm_t_uint32 *w_str;
2080   scm_t_wchar *cbuf;
2081   size_t rlen, len = scm_i_string_length (string);
2082
2083   if (scm_i_is_narrow_string (string))
2084     {
2085       size_t i;
2086       const char *buf = scm_i_string_chars (string);
2087
2088       w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2089
2090       for (i = 0; i < len; i ++)
2091         w_str[i] = (unsigned char) buf[i];
2092       w_str[len] = 0;
2093     }
2094   else
2095     w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2096
2097   w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2098
2099   ret = scm_i_make_wide_string (rlen, &cbuf, 0);
2100   u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2101   free (w_str);
2102
2103   scm_i_try_narrow_string (ret);
2104
2105   return ret;
2106 }
2107
2108 SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2109             (SCM string),
2110             "Returns the NFC normalized form of @var{string}.")
2111 #define FUNC_NAME s_scm_string_normalize_nfc
2112 {
2113   SCM_VALIDATE_STRING (1, string);
2114   return normalize_str (string, UNINORM_NFC);
2115 }
2116 #undef FUNC_NAME
2117
2118 SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2119             (SCM string),
2120             "Returns the NFD normalized form of @var{string}.")
2121 #define FUNC_NAME s_scm_string_normalize_nfd
2122 {
2123   SCM_VALIDATE_STRING (1, string);
2124   return normalize_str (string, UNINORM_NFD);
2125 }
2126 #undef FUNC_NAME
2127
2128 SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2129             (SCM string),
2130             "Returns the NFKC normalized form of @var{string}.")
2131 #define FUNC_NAME s_scm_string_normalize_nfkc
2132 {
2133   SCM_VALIDATE_STRING (1, string);
2134   return normalize_str (string, UNINORM_NFKC);
2135 }
2136 #undef FUNC_NAME
2137
2138 SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2139             (SCM string),
2140             "Returns the NFKD normalized form of @var{string}.")
2141 #define FUNC_NAME s_scm_string_normalize_nfkd
2142 {
2143   SCM_VALIDATE_STRING (1, string);
2144   return normalize_str (string, UNINORM_NFKD);
2145 }
2146 #undef FUNC_NAME
2147
2148 /* converts C scm_array of strings to SCM scm_list of strings.
2149    If argc < 0, a null terminated scm_array is assumed.
2150    The current locale encoding is assumed */
2151 SCM
2152 scm_makfromstrs (int argc, char **argv)
2153 {
2154   int i = argc;
2155   SCM lst = SCM_EOL;
2156   if (0 > i)
2157     for (i = 0; argv[i]; i++);
2158   while (i--)
2159     lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2160   return lst;
2161 }
2162
2163 /* Return a newly allocated array of char pointers to each of the strings
2164    in args, with a terminating NULL pointer.  The strings are encoded using
2165    the current locale. */
2166
2167 char **
2168 scm_i_allocate_string_pointers (SCM list)
2169 #define FUNC_NAME "scm_i_allocate_string_pointers"
2170 {
2171   char **result;
2172   int list_len = scm_ilength (list);
2173   int i;
2174
2175   if (list_len < 0)
2176     scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2177
2178   result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2179                           "string pointers");
2180   result[list_len] = NULL;
2181
2182   /* The list might have been modified in another thread, so
2183      we check LIST before each access.
2184    */
2185   for (i = 0; i < list_len && scm_is_pair (list); i++)
2186     {
2187       SCM str = SCM_CAR (list);
2188       size_t len;  /* String length in bytes */
2189       char *c_str = scm_to_locale_stringn (str, &len);
2190
2191       /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2192          scm_malloc to allocate the returned string, which must be
2193          explicitly deallocated.  This forces us to copy the string a
2194          second time into a new buffer.  Ideally there would be variants
2195          of scm_to_*_stringn that can return garbage-collected buffers. */
2196
2197       result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2198       memcpy (result[i], c_str, len);
2199       result[i][len] = '\0';
2200       free (c_str);
2201
2202       list = SCM_CDR (list);
2203     }
2204
2205   return result;
2206 }
2207 #undef FUNC_NAME
2208
2209 void
2210 scm_i_get_substring_spec (size_t len,
2211                           SCM start, size_t *cstart,
2212                           SCM end, size_t *cend)
2213 {
2214   if (SCM_UNBNDP (start))
2215     *cstart = 0;
2216   else
2217     *cstart = scm_to_unsigned_integer (start, 0, len);
2218
2219   if (SCM_UNBNDP (end))
2220     *cend = len;
2221   else
2222     *cend = scm_to_unsigned_integer (end, *cstart, len);
2223 }
2224
2225 static SCM
2226 string_handle_ref (scm_t_array_handle *h, size_t index)
2227 {
2228   return scm_c_string_ref (h->array, index);
2229 }
2230
2231 static void
2232 string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2233 {
2234   scm_c_string_set_x (h->array, index, val);
2235 }
2236
2237 static void
2238 string_get_handle (SCM v, scm_t_array_handle *h)
2239 {
2240   h->array = v;
2241   h->ndims = 1;
2242   h->dims = &h->dim0;
2243   h->dim0.lbnd = 0;
2244   h->dim0.ubnd = scm_c_string_length (v) - 1;
2245   h->dim0.inc = 1;
2246   h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2247   h->elements = h->writable_elements = NULL;
2248 }
2249
2250 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2251                           string_handle_ref, string_handle_set,
2252                           string_get_handle)
2253 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2254
2255 void
2256 scm_init_strings ()
2257 {
2258   scm_nullstr = scm_i_make_string (0, NULL, 0);
2259
2260 #include "libguile/strings.x"
2261 }
2262
2263
2264 /*
2265   Local Variables:
2266   c-file-style: "gnu"
2267   End:
2268 */