libguile/strings.c

   1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010 Free Software Foundation, Inc.
   2  *
   3  * This library is free software; you can redistribute it and/or
   4  * modify it under the terms of the GNU Lesser General Public License
   5  * as published by the Free Software Foundation; either version 3 of
   6  * the License, or (at your option) any later version.
   7  *
   8  * This library is distributed in the hope that it will be useful, but
   9  * WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * Lesser General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU Lesser General Public
  14  * License along with this library; if not, write to the Free Software
  15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  16  * 02110-1301 USA
  17  */
  18
  19
  20 \f
  21 #ifdef HAVE_CONFIG_H
  22 # include <config.h>
  23 #endif
  24
  25 #include <string.h>
  26 #include <stdio.h>
  27 #include <ctype.h>
  28 #include <uninorm.h>
  29 #include <unistr.h>
  30 #include <uniconv.h>
  31
  32 #include "striconveh.h"
  33
  34 #include "libguile/_scm.h"
  35 #include "libguile/chars.h"
  36 #include "libguile/root.h"
  37 #include "libguile/strings.h"
  38 #include "libguile/error.h"
  39 #include "libguile/generalized-vectors.h"
  40 #include "libguile/deprecation.h"
  41 #include "libguile/validate.h"
  42 #include "libguile/private-options.h"
  43
  44 \f
  45
  46 /* {Strings}
  47  */
  48
  49
  50 /* Stringbufs
  51  *
  52  * XXX - keeping an accurate refcount during GC seems to be quite
  53  * tricky, so we just keep score of whether a stringbuf might be
  54  * shared, not whether it definitely is.
  55  *
  56  * The scheme I (mvo) tried to keep an accurate reference count would
  57  * recount all strings that point to a stringbuf during the mark-phase
  58  * of the GC.  This was done since one cannot access the stringbuf of
  59  * a string when that string is freed (in order to decrease the
  60  * reference count).  The memory of the stringbuf might have been
  61  * reused already for something completely different.
  62  *
  63  * This recounted worked for a small number of threads beating on
  64  * cow-strings, but it failed randomly with more than 10 threads, say.
  65  * I couldn't figure out what went wrong, so I used the conservative
  66  * approach implemented below.
  67  *
  68  * There are 2 storage strategies for stringbufs: 8-bit and wide.  8-bit
  69  * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
  70  * strings.
  71  */
  72
  73 /* The size in words of the stringbuf header (type tag + size).  */
  74 #define STRINGBUF_HEADER_SIZE   2U
  75
  76 #define STRINGBUF_HEADER_BYTES  (STRINGBUF_HEADER_SIZE * sizeof (SCM))
  77
  78 #define STRINGBUF_F_SHARED      SCM_I_STRINGBUF_F_SHARED
  79 #define STRINGBUF_F_WIDE        SCM_I_STRINGBUF_F_WIDE
  80
  81 #define STRINGBUF_TAG           scm_tc7_stringbuf
  82 #define STRINGBUF_SHARED(buf)   (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
  83 #define STRINGBUF_WIDE(buf)     (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
  84
  85 #define STRINGBUF_CONTENTS(buf) ((void *)                               \
  86                                  SCM_CELL_OBJECT_LOC (buf,              \
  87                                                       STRINGBUF_HEADER_SIZE))
  88 #define STRINGBUF_CHARS(buf)    ((unsigned char *) STRINGBUF_CONTENTS (buf))
  89 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
  90
  91 #define STRINGBUF_LENGTH(buf)   (SCM_CELL_WORD_1 (buf))
  92
  93 #define SET_STRINGBUF_SHARED(buf)                                       \
  94   do                                                                    \
  95     {                                                                   \
  96       /* Don't modify BUF if it's already marked as shared since it might be \
  97          a read-only, statically allocated stringbuf.  */               \
  98       if (SCM_LIKELY (!STRINGBUF_SHARED (buf)))                         \
  99         SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
 100     }                                                                   \
 101   while (0)
 102
 103 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 104 static size_t lenhist[1001];
 105 #endif
 106
 107 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
 108    characters. */
 109 static SCM
 110 make_stringbuf (size_t len)
 111 {
 112   /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
 113      scm_i_symbol_chars, all stringbufs are null-terminated.  Once
 114      SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
 115      has been changed for scm_i_symbol_chars, this null-termination
 116      can be dropped.
 117   */
 118
 119   SCM buf;
 120
 121 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 122   if (len < 1000)
 123     lenhist[len]++;
 124   else
 125     lenhist[1000]++;
 126 #endif
 127
 128   buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
 129                                             "string"));
 130
 131   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
 132   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 133
 134   STRINGBUF_CHARS (buf)[len] = 0;
 135
 136   return buf;
 137 }
 138
 139 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
 140    characters.  */
 141 static SCM
 142 make_wide_stringbuf (size_t len)
 143 {
 144   SCM buf;
 145   size_t raw_len;
 146
 147 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 148   if (len < 1000)
 149     lenhist[len]++;
 150   else
 151     lenhist[1000]++;
 152 #endif
 153
 154   raw_len = (len + 1) * sizeof (scm_t_wchar);
 155   buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
 156                                             "string"));
 157
 158   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
 159   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 160
 161   STRINGBUF_WIDE_CHARS (buf)[len] = 0;
 162
 163   return buf;
 164 }
 165
 166 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
 167    characters from BUF.  */
 168 static SCM
 169 wide_stringbuf (SCM buf)
 170 {
 171   SCM new_buf;
 172
 173   if (STRINGBUF_WIDE (buf))
 174     new_buf = buf;
 175   else
 176     {
 177       size_t i, len;
 178       scm_t_wchar *mem;
 179
 180       len = STRINGBUF_LENGTH (buf);
 181
 182       new_buf = make_wide_stringbuf (len);
 183
 184       mem = STRINGBUF_WIDE_CHARS (new_buf);
 185       for (i = 0; i < len; i++)
 186         mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
 187       mem[len] = 0;
 188     }
 189
 190   return new_buf;
 191 }
 192
 193 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
 194    characters from BUF, if possible.  */
 195 static SCM
 196 narrow_stringbuf (SCM buf)
 197 {
 198   SCM new_buf;
 199
 200   if (!STRINGBUF_WIDE (buf))
 201     new_buf = buf;
 202   else
 203     {
 204       size_t i, len;
 205       scm_t_wchar *wmem;
 206       unsigned char *mem;
 207
 208       len = STRINGBUF_LENGTH (buf);
 209       wmem = STRINGBUF_WIDE_CHARS (buf);
 210
 211       for (i = 0; i < len; i++)
 212         if (wmem[i] > 0xFF)
 213           /* BUF cannot be narrowed.  */
 214           return buf;
 215
 216       new_buf = make_stringbuf (len);
 217
 218       mem = STRINGBUF_CHARS (new_buf);
 219       for (i = 0; i < len; i++)
 220         mem[i] = (unsigned char) wmem[i];
 221       mem[len] = 0;
 222     }
 223
 224   return new_buf;
 225 }
 226
 227 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
 228
 229 \f
 230 /* Copy-on-write strings.
 231  */
 232
 233 #define STRING_TAG            scm_tc7_string
 234
 235 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
 236 #define STRING_START(str)     ((size_t)SCM_CELL_WORD_2(str))
 237 #define STRING_LENGTH(str)    ((size_t)SCM_CELL_WORD_3(str))
 238
 239 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
 240 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
 241
 242 #define IS_STRING(str)        (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
 243
 244 /* Read-only strings.
 245  */
 246
 247 #define RO_STRING_TAG         scm_tc7_ro_string
 248 #define IS_RO_STRING(str)     (SCM_CELL_TYPE(str)==RO_STRING_TAG)
 249
 250 /* Mutation-sharing substrings
 251  */
 252
 253 #define SH_STRING_TAG       (scm_tc7_string + 0x100)
 254
 255 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
 256 /* START and LENGTH as for STRINGs. */
 257
 258 #define IS_SH_STRING(str)   (SCM_CELL_TYPE(str)==SH_STRING_TAG)
 259
 260 SCM scm_nullstr;
 261
 262 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
 263    characters.  CHARSP, if not NULL, will be set to location of the
 264    char array.  */
 265 SCM
 266 scm_i_make_string (size_t len, char **charsp)
 267 {
 268   SCM buf = make_stringbuf (len);
 269   SCM res;
 270   if (charsp)
 271     *charsp = (char *) STRINGBUF_CHARS (buf);
 272   res = scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
 273                          (scm_t_bits)0, (scm_t_bits) len);
 274   return res;
 275 }
 276
 277 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
 278    characters.  CHARSP, if not NULL, will be set to location of the
 279    character array.  */
 280 SCM
 281 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp)
 282 {
 283   SCM buf = make_wide_stringbuf (len);
 284   SCM res;
 285   if (charsp)
 286     *charsp = STRINGBUF_WIDE_CHARS (buf);
 287   res = scm_double_cell (STRING_TAG, SCM_UNPACK (buf),
 288                          (scm_t_bits) 0, (scm_t_bits) len);
 289   return res;
 290 }
 291
 292 static void
 293 validate_substring_args (SCM str, size_t start, size_t end)
 294 {
 295   if (!IS_STRING (str))
 296     scm_wrong_type_arg_msg (NULL, 0, str, "string");
 297   if (start > STRING_LENGTH (str))
 298     scm_out_of_range (NULL, scm_from_size_t (start));
 299   if (end > STRING_LENGTH (str) || end < start)
 300     scm_out_of_range (NULL, scm_from_size_t (end));
 301 }
 302
 303 static inline void
 304 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
 305 {
 306   *start = STRING_START (*str);
 307   if (IS_SH_STRING (*str))
 308     {
 309       *str = SH_STRING_STRING (*str);
 310       *start += STRING_START (*str);
 311     }
 312   *buf = STRING_STRINGBUF (*str);
 313 }
 314
 315 SCM
 316 scm_i_substring (SCM str, size_t start, size_t end)
 317 {
 318   SCM buf;
 319   size_t str_start;
 320   get_str_buf_start (&str, &buf, &str_start);
 321   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 322   SET_STRINGBUF_SHARED (buf);
 323   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 324   return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
 325                           (scm_t_bits)str_start + start,
 326                           (scm_t_bits) end - start);
 327 }
 328
 329 SCM
 330 scm_i_substring_read_only (SCM str, size_t start, size_t end)
 331 {
 332   SCM buf;
 333   size_t str_start;
 334   get_str_buf_start (&str, &buf, &str_start);
 335   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 336   SET_STRINGBUF_SHARED (buf);
 337   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 338   return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
 339                           (scm_t_bits)str_start + start,
 340                           (scm_t_bits) end - start);
 341 }
 342
 343 SCM
 344 scm_i_substring_copy (SCM str, size_t start, size_t end)
 345 {
 346   size_t len = end - start;
 347   SCM buf, my_buf;
 348   size_t str_start;
 349   get_str_buf_start (&str, &buf, &str_start);
 350   if (scm_i_is_narrow_string (str))
 351     {
 352       my_buf = make_stringbuf (len);
 353       memcpy (STRINGBUF_CHARS (my_buf),
 354               STRINGBUF_CHARS (buf) + str_start + start, len);
 355     }
 356   else
 357     {
 358       my_buf = make_wide_stringbuf (len);
 359       u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
 360                (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
 361                                  + start), len);
 362       /* Even though this string is wide, the substring may be narrow.
 363          Consider adding code to narrow the string.  */
 364     }
 365   scm_remember_upto_here_1 (buf);
 366   return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
 367                           (scm_t_bits) 0, (scm_t_bits) len);
 368 }
 369
 370 SCM
 371 scm_i_substring_shared (SCM str, size_t start, size_t end)
 372 {
 373   if (start == 0 && end == STRING_LENGTH (str))
 374     return str;
 375   else
 376     {
 377       size_t len = end - start;
 378       if (IS_SH_STRING (str))
 379         {
 380           start += STRING_START (str);
 381           str = SH_STRING_STRING (str);
 382         }
 383       return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
 384                               (scm_t_bits)start, (scm_t_bits) len);
 385     }
 386 }
 387
 388 SCM
 389 scm_c_substring (SCM str, size_t start, size_t end)
 390 {
 391   validate_substring_args (str, start, end);
 392   return scm_i_substring (str, start, end);
 393 }
 394
 395 SCM
 396 scm_c_substring_read_only (SCM str, size_t start, size_t end)
 397 {
 398   validate_substring_args (str, start, end);
 399   return scm_i_substring_read_only (str, start, end);
 400 }
 401
 402 SCM
 403 scm_c_substring_copy (SCM str, size_t start, size_t end)
 404 {
 405   validate_substring_args (str, start, end);
 406   return scm_i_substring_copy (str, start, end);
 407 }
 408
 409 SCM
 410 scm_c_substring_shared (SCM str, size_t start, size_t end)
 411 {
 412   validate_substring_args (str, start, end);
 413   return scm_i_substring_shared (str, start, end);
 414 }
 415
 416 \f
 417 /* Internal accessors
 418  */
 419
 420 /* Returns the number of characters in STR.  This may be different
 421    than the memory size of the string storage.  */
 422 size_t
 423 scm_i_string_length (SCM str)
 424 {
 425   return STRING_LENGTH (str);
 426 }
 427
 428 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
 429    encoding.  False if it is 'wide', having a 32-bit UCS-4
 430    encoding.  */
 431 int
 432 scm_i_is_narrow_string (SCM str)
 433 {
 434   return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
 435 }
 436
 437 /* Try to coerce a string to be narrow.  It if is narrow already, do
 438    nothing.  If it is wide, shrink it to narrow if none of its
 439    characters are above 0xFF.  Return true if the string is narrow or
 440    was made to be narrow.  */
 441 int
 442 scm_i_try_narrow_string (SCM str)
 443 {
 444   SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
 445
 446   return scm_i_is_narrow_string (str);
 447 }
 448
 449 /* Return a pointer to the raw data of the string, which can be either Latin-1
 450    or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'.  */
 451 const void *
 452 scm_i_string_data (SCM str)
 453 {
 454   SCM buf;
 455   size_t start;
 456   const char *data;
 457
 458   get_str_buf_start (&str, &buf, &start);
 459
 460   data = STRINGBUF_CONTENTS (buf);
 461   data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
 462
 463   return data;
 464 }
 465
 466 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
 467    STR.  */
 468 const char *
 469 scm_i_string_chars (SCM str)
 470 {
 471   SCM buf;
 472   size_t start;
 473   get_str_buf_start (&str, &buf, &start);
 474   if (scm_i_is_narrow_string (str))
 475     return (const char *) STRINGBUF_CHARS (buf) + start;
 476   else
 477     scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
 478                     scm_list_1 (str));
 479   return NULL;
 480 }
 481
 482 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
 483    STR.  */
 484 const scm_t_wchar *
 485 scm_i_string_wide_chars (SCM str)
 486 {
 487   SCM buf;
 488   size_t start;
 489
 490   get_str_buf_start (&str, &buf, &start);
 491   if (!scm_i_is_narrow_string (str))
 492     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
 493   else
 494     scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
 495                     scm_list_1 (str));
 496 }
 497
 498 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
 499    a new string buffer, so that it can be modified without modifying
 500    other strings.  Also, lock the string mutex.  Later, one must call
 501    scm_i_string_stop_writing to unlock the mutex.  */
 502 SCM
 503 scm_i_string_start_writing (SCM orig_str)
 504 {
 505   SCM buf, str = orig_str;
 506   size_t start;
 507
 508   get_str_buf_start (&str, &buf, &start);
 509   if (IS_RO_STRING (str))
 510     scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
 511
 512   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 513   if (STRINGBUF_SHARED (buf))
 514     {
 515       /* Clone the stringbuf.  */
 516       size_t len = STRING_LENGTH (str);
 517       SCM new_buf;
 518
 519       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 520
 521       if (scm_i_is_narrow_string (str))
 522         {
 523           new_buf = make_stringbuf (len);
 524           memcpy (STRINGBUF_CHARS (new_buf),
 525                   STRINGBUF_CHARS (buf) + STRING_START (str), len);
 526
 527         }
 528       else
 529         {
 530           new_buf = make_wide_stringbuf (len);
 531           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 532                    (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
 533                                      + STRING_START (str)), len);
 534         }
 535
 536       SET_STRING_STRINGBUF (str, new_buf);
 537       start -= STRING_START (str);
 538
 539       /* FIXME: The following operations are not atomic, so other threads
 540          looking at STR may see an inconsistent state.  Nevertheless it can't
 541          hurt much since (i) accessing STR while it is being mutated can't
 542          yield a crash, and (ii) concurrent accesses to STR should be
 543          protected by a mutex at the application level.  The latter may not
 544          apply when STR != ORIG_STR, though.  */
 545       SET_STRING_START (str, 0);
 546       SET_STRING_STRINGBUF (str, new_buf);
 547
 548       buf = new_buf;
 549
 550       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 551     }
 552   return orig_str;
 553 }
 554
 555 /* Return a pointer to the 8-bit Latin-1 chars of a string.  */
 556 char *
 557 scm_i_string_writable_chars (SCM str)
 558 {
 559   SCM buf;
 560   size_t start;
 561
 562   get_str_buf_start (&str, &buf, &start);
 563   if (scm_i_is_narrow_string (str))
 564     return (char *) STRINGBUF_CHARS (buf) + start;
 565   else
 566     scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
 567                     scm_list_1 (str));
 568   return NULL;
 569 }
 570
 571 /* Return a pointer to the UCS-4 codepoints of a string.  */
 572 static scm_t_wchar *
 573 scm_i_string_writable_wide_chars (SCM str)
 574 {
 575   SCM buf;
 576   size_t start;
 577
 578   get_str_buf_start (&str, &buf, &start);
 579   if (!scm_i_is_narrow_string (str))
 580     return STRINGBUF_WIDE_CHARS (buf) + start;
 581   else
 582     scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
 583                     scm_list_1 (str));
 584 }
 585
 586 /* Unlock the string mutex that was locked when
 587    scm_i_string_start_writing was called.  */
 588 void
 589 scm_i_string_stop_writing (void)
 590 {
 591   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 592 }
 593
 594 /* Return the Xth character of STR as a UCS-4 codepoint.  */
 595 scm_t_wchar
 596 scm_i_string_ref (SCM str, size_t x)
 597 {
 598   if (scm_i_is_narrow_string (str))
 599     return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
 600   else
 601     return scm_i_string_wide_chars (str)[x];
 602 }
 603
 604 /* Returns index+1 of the first char in STR that matches C, or
 605    0 if the char is not found.  */
 606 int
 607 scm_i_string_contains_char (SCM str, char ch)
 608 {
 609   size_t i;
 610   size_t len = scm_i_string_length (str);
 611
 612   i = 0;
 613   if (scm_i_is_narrow_string (str))
 614     {
 615       while (i < len)
 616         {
 617           if (scm_i_string_chars (str)[i] == ch)
 618             return i+1;
 619           i++;
 620         }
 621     }
 622   else
 623     {
 624       while (i < len)
 625         {
 626           if (scm_i_string_wide_chars (str)[i]
 627               == (unsigned char) ch)
 628             return i+1;
 629           i++;
 630         }
 631     }
 632   return 0;
 633 }
 634
 635 int
 636 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
 637 {
 638   if (scm_i_is_narrow_string (sstr))
 639     {
 640       const char *a = scm_i_string_chars (sstr) + start_x;
 641       const char *b = cstr;
 642       return strncmp (a, b, strlen(b));
 643     }
 644   else
 645     {
 646       size_t i;
 647       const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
 648       const char *b = cstr;
 649       for (i = 0; i < strlen (b); i++)
 650         {
 651           if (a[i] != (unsigned char) b[i])
 652             return 1;
 653         }
 654     }
 655   return 0;
 656 }
 657
 658 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
 659 void
 660 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
 661 {
 662   if (chr > 0xFF && scm_i_is_narrow_string (str))
 663     SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
 664
 665   if (scm_i_is_narrow_string (str))
 666     {
 667       char *dst = scm_i_string_writable_chars (str);
 668       dst[p] = chr;
 669     }
 670   else
 671     {
 672       scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
 673       dst[p] = chr;
 674     }
 675 }
 676
 677 \f
 678 /* Symbols.
 679
 680    Basic symbol creation and accessing is done here, the rest is in
 681    symbols.[hc].  This has been done to keep stringbufs and the
 682    internals of strings and string-like objects confined to this file.
 683 */
 684
 685 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
 686
 687 SCM
 688 scm_i_make_symbol (SCM name, scm_t_bits flags,
 689                    unsigned long hash, SCM props)
 690 {
 691   SCM buf;
 692   size_t start = STRING_START (name);
 693   size_t length = STRING_LENGTH (name);
 694
 695   if (IS_SH_STRING (name))
 696     {
 697       name = SH_STRING_STRING (name);
 698       start += STRING_START (name);
 699     }
 700   buf = SYMBOL_STRINGBUF (name);
 701
 702   if (start == 0 && length == STRINGBUF_LENGTH (buf))
 703     {
 704       /* reuse buf. */
 705       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 706       SET_STRINGBUF_SHARED (buf);
 707       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 708     }
 709   else
 710     {
 711       /* make new buf. */
 712       if (scm_i_is_narrow_string (name))
 713         {
 714           SCM new_buf = make_stringbuf (length);
 715           memcpy (STRINGBUF_CHARS (new_buf),
 716                   STRINGBUF_CHARS (buf) + start, length);
 717           buf = new_buf;
 718         }
 719       else
 720         {
 721           SCM new_buf = make_wide_stringbuf (length);
 722           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 723                    (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
 724                    length);
 725           buf = new_buf;
 726         }
 727     }
 728   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 729                           (scm_t_bits) hash, SCM_UNPACK (props));
 730 }
 731
 732 SCM
 733 scm_i_c_make_symbol (const char *name, size_t len,
 734                      scm_t_bits flags, unsigned long hash, SCM props)
 735 {
 736   SCM buf = make_stringbuf (len);
 737   memcpy (STRINGBUF_CHARS (buf), name, len);
 738
 739   return scm_immutable_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 740                                     (scm_t_bits) hash, SCM_UNPACK (props));
 741 }
 742
 743 /* Returns the number of characters in SYM.  This may be different
 744    from the memory size of SYM.  */
 745 size_t
 746 scm_i_symbol_length (SCM sym)
 747 {
 748   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 749 }
 750
 751 size_t
 752 scm_c_symbol_length (SCM sym)
 753 #define FUNC_NAME "scm_c_symbol_length"
 754 {
 755   SCM_VALIDATE_SYMBOL (1, sym);
 756
 757   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 758 }
 759 #undef FUNC_NAME
 760
 761 /* True if the name of SYM is stored as a Latin-1 encoded string.
 762    False if it is stored as a 32-bit UCS-4-encoded string.  */
 763 int
 764 scm_i_is_narrow_symbol (SCM sym)
 765 {
 766   SCM buf;
 767
 768   buf = SYMBOL_STRINGBUF (sym);
 769   return !STRINGBUF_WIDE (buf);
 770 }
 771
 772 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
 773    contains the name of SYM.  */
 774 const char *
 775 scm_i_symbol_chars (SCM sym)
 776 {
 777   SCM buf;
 778
 779   buf = SYMBOL_STRINGBUF (sym);
 780   if (!STRINGBUF_WIDE (buf))
 781     return (const char *) STRINGBUF_CHARS (buf);
 782   else
 783     scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
 784                     scm_list_1 (sym));
 785 }
 786
 787 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
 788    symbol's name.  */
 789 const scm_t_wchar *
 790 scm_i_symbol_wide_chars (SCM sym)
 791 {
 792   SCM buf;
 793
 794   buf = SYMBOL_STRINGBUF (sym);
 795   if (STRINGBUF_WIDE (buf))
 796     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
 797   else
 798     scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
 799                     scm_list_1 (sym));
 800 }
 801
 802 SCM
 803 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
 804 {
 805   SCM buf = SYMBOL_STRINGBUF (sym);
 806   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 807   SET_STRINGBUF_SHARED (buf);
 808   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 809   return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
 810                           (scm_t_bits)start, (scm_t_bits) end - start);
 811 }
 812
 813 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint.  */
 814 scm_t_wchar
 815 scm_i_symbol_ref (SCM sym, size_t x)
 816 {
 817   if (scm_i_is_narrow_symbol (sym))
 818     return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
 819   else
 820     return scm_i_symbol_wide_chars (sym)[x];
 821 }
 822
 823 /* Debugging
 824  */
 825
 826 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
 827             "Returns an association list containing debugging information\n"
 828             "for @var{str}. The association list has the following entries."
 829             "@table @code\n"
 830             "@item string\n"
 831             "The string itself.\n"
 832             "@item start\n"
 833             "The start index of the string into its stringbuf\n"
 834             "@item length\n"
 835             "The length of the string\n"
 836             "@item shared\n"
 837             "If this string is a substring, it returns its parent string.\n"
 838             "Otherwise, it returns @code{#f}\n"
 839             "@item read-only\n"
 840             "@code{#t} if the string is read-only\n"
 841             "@item stringbuf-chars\n"
 842             "A new string containing this string's stringbuf's characters\n"
 843             "@item stringbuf-length\n"
 844             "The number of characters in this stringbuf\n"
 845             "@item stringbuf-shared\n"
 846             "@code{#t} if this stringbuf is shared\n"
 847             "@item stringbuf-wide\n"
 848             "@code{#t} if this stringbuf's characters are stored in a\n"
 849             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
 850             "buffer\n"
 851             "@end table")
 852 #define FUNC_NAME s_scm_sys_string_dump
 853 {
 854   SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
 855   SCM buf;
 856   SCM_VALIDATE_STRING (1, str);
 857
 858   /* String info */
 859   e1 = scm_cons (scm_from_locale_symbol ("string"),
 860                  str);
 861   e2 = scm_cons (scm_from_locale_symbol ("start"),
 862                  scm_from_size_t (STRING_START (str)));
 863   e3 = scm_cons (scm_from_locale_symbol ("length"),
 864                  scm_from_size_t (STRING_LENGTH (str)));
 865
 866   if (IS_SH_STRING (str))
 867     {
 868       e4 = scm_cons (scm_from_locale_symbol ("shared"),
 869                      SH_STRING_STRING (str));
 870       buf = STRING_STRINGBUF (SH_STRING_STRING (str));
 871     }
 872   else
 873     {
 874       e4 = scm_cons (scm_from_locale_symbol ("shared"),
 875                      SCM_BOOL_F);
 876       buf = STRING_STRINGBUF (str);
 877     }
 878
 879   if (IS_RO_STRING (str))
 880     e5 = scm_cons (scm_from_locale_symbol ("read-only"),
 881                    SCM_BOOL_T);
 882   else
 883     e5 = scm_cons (scm_from_locale_symbol ("read-only"),
 884                    SCM_BOOL_F);
 885
 886   /* Stringbuf info */
 887   if (!STRINGBUF_WIDE (buf))
 888     {
 889       size_t len = STRINGBUF_LENGTH (buf);
 890       char *cbuf;
 891       SCM sbc = scm_i_make_string (len, &cbuf);
 892       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
 893       e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
 894                      sbc);
 895     }
 896   else
 897     {
 898       size_t len = STRINGBUF_LENGTH (buf);
 899       scm_t_wchar *cbuf;
 900       SCM sbc = scm_i_make_wide_string (len, &cbuf);
 901       u32_cpy ((scm_t_uint32 *) cbuf,
 902                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
 903       e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
 904                      sbc);
 905     }
 906   e7 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
 907                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
 908   if (STRINGBUF_SHARED (buf))
 909     e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
 910                    SCM_BOOL_T);
 911   else
 912     e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
 913                    SCM_BOOL_F);
 914   if (STRINGBUF_WIDE (buf))
 915     e9 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
 916                    SCM_BOOL_T);
 917   else
 918     e9 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
 919                    SCM_BOOL_F);
 920
 921   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
 922 }
 923 #undef FUNC_NAME
 924
 925 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
 926             "Returns an association list containing debugging information\n"
 927             "for @var{sym}. The association list has the following entries."
 928             "@table @code\n"
 929             "@item symbol\n"
 930             "The symbol itself\n"
 931             "@item hash\n"
 932             "Its hash value\n"
 933             "@item interned\n"
 934             "@code{#t} if it is an interned symbol\n"
 935             "@item stringbuf-chars\n"
 936             "A new string containing this symbols's stringbuf's characters\n"
 937             "@item stringbuf-length\n"
 938             "The number of characters in this stringbuf\n"
 939             "@item stringbuf-shared\n"
 940             "@code{#t} if this stringbuf is shared\n"
 941             "@item stringbuf-wide\n"
 942             "@code{#t} if this stringbuf's characters are stored in a\n"
 943             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
 944             "buffer\n"
 945             "@end table")
 946 #define FUNC_NAME s_scm_sys_symbol_dump
 947 {
 948   SCM e1, e2, e3, e4, e5, e6, e7;
 949   SCM buf;
 950   SCM_VALIDATE_SYMBOL (1, sym);
 951   e1 = scm_cons (scm_from_locale_symbol ("symbol"),
 952                  sym);
 953   e2 = scm_cons (scm_from_locale_symbol ("hash"),
 954                  scm_from_ulong (scm_i_symbol_hash (sym)));
 955   e3 = scm_cons (scm_from_locale_symbol ("interned"),
 956                  scm_symbol_interned_p (sym));
 957   buf = SYMBOL_STRINGBUF (sym);
 958
 959   /* Stringbuf info */
 960   if (!STRINGBUF_WIDE (buf))
 961     {
 962       size_t len = STRINGBUF_LENGTH (buf);
 963       char *cbuf;
 964       SCM sbc = scm_i_make_string (len, &cbuf);
 965       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
 966       e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
 967                      sbc);
 968     }
 969   else
 970     {
 971       size_t len = STRINGBUF_LENGTH (buf);
 972       scm_t_wchar *cbuf;
 973       SCM sbc = scm_i_make_wide_string (len, &cbuf);
 974       u32_cpy ((scm_t_uint32 *) cbuf,
 975                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
 976       e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
 977                      sbc);
 978     }
 979   e5 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
 980                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
 981   if (STRINGBUF_SHARED (buf))
 982     e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
 983                    SCM_BOOL_T);
 984   else
 985     e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
 986                    SCM_BOOL_F);
 987   if (STRINGBUF_WIDE (buf))
 988     e7 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
 989                     SCM_BOOL_T);
 990   else
 991     e7 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
 992                     SCM_BOOL_F);
 993   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
 994
 995 }
 996 #undef FUNC_NAME
 997
 998 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 999
1000 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
1001 #define FUNC_NAME s_scm_sys_stringbuf_hist
1002 {
1003   int i;
1004   for (i = 0; i < 1000; i++)
1005     if (lenhist[i])
1006       fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1007   fprintf (stderr, ">999: %u\n", lenhist[1000]);
1008   return SCM_UNSPECIFIED;
1009 }
1010 #undef FUNC_NAME
1011
1012 #endif
1013
1014 \f
1015
1016 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1017             (SCM obj),
1018             "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1019 #define FUNC_NAME s_scm_string_p
1020 {
1021   return scm_from_bool (IS_STRING (obj));
1022 }
1023 #undef FUNC_NAME
1024
1025
1026 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1027
1028 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1029             (SCM chrs),
1030             "@deffnx {Scheme Procedure} list->string chrs\n"
1031             "Return a newly allocated string composed of the arguments,\n"
1032             "@var{chrs}.")
1033 #define FUNC_NAME s_scm_string
1034 {
1035   SCM result = SCM_BOOL_F;
1036   SCM rest;
1037   size_t len;
1038   size_t p = 0;
1039   long i;
1040   int wide = 0;
1041
1042   /* Verify that this is a list of chars.  */
1043   i = scm_ilength (chrs);
1044   SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1045
1046   len = (size_t) i;
1047   rest = chrs;
1048
1049   while (len > 0 && scm_is_pair (rest))
1050     {
1051       SCM elt = SCM_CAR (rest);
1052       SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1053       if (SCM_CHAR (elt) > 0xFF)
1054         wide = 1;
1055       rest = SCM_CDR (rest);
1056       len--;
1057       scm_remember_upto_here_1 (elt);
1058     }
1059
1060   /* Construct a string containing this list of chars.  */
1061   len = (size_t) i;
1062   rest = chrs;
1063
1064   if (wide == 0)
1065     {
1066       char *buf;
1067
1068       result = scm_i_make_string (len, NULL);
1069       result = scm_i_string_start_writing (result);
1070       buf = scm_i_string_writable_chars (result);
1071       while (len > 0 && scm_is_pair (rest))
1072         {
1073           SCM elt = SCM_CAR (rest);
1074           buf[p] = (unsigned char) SCM_CHAR (elt);
1075           p++;
1076           rest = SCM_CDR (rest);
1077           len--;
1078           scm_remember_upto_here_1 (elt);
1079         }
1080     }
1081   else
1082     {
1083       scm_t_wchar *buf;
1084
1085       result = scm_i_make_wide_string (len, NULL);
1086       result = scm_i_string_start_writing (result);
1087       buf = scm_i_string_writable_wide_chars (result);
1088       while (len > 0 && scm_is_pair (rest))
1089         {
1090           SCM elt = SCM_CAR (rest);
1091           buf[p] = SCM_CHAR (elt);
1092           p++;
1093           rest = SCM_CDR (rest);
1094           len--;
1095           scm_remember_upto_here_1 (elt);
1096         }
1097     }
1098   scm_i_string_stop_writing ();
1099
1100   if (len > 0)
1101     scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1102   if (!scm_is_null (rest))
1103     scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1104
1105   return result;
1106 }
1107 #undef FUNC_NAME
1108
1109 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1110             (SCM k, SCM chr),
1111             "Return a newly allocated string of\n"
1112             "length @var{k}.  If @var{chr} is given, then all elements of\n"
1113             "the string are initialized to @var{chr}, otherwise the contents\n"
1114             "of the @var{string} are unspecified.")
1115 #define FUNC_NAME s_scm_make_string
1116 {
1117   return scm_c_make_string (scm_to_size_t (k), chr);
1118 }
1119 #undef FUNC_NAME
1120
1121 SCM
1122 scm_c_make_string (size_t len, SCM chr)
1123 #define FUNC_NAME NULL
1124 {
1125   size_t p;
1126   SCM res = scm_i_make_string (len, NULL);
1127
1128   if (!SCM_UNBNDP (chr))
1129     {
1130       SCM_VALIDATE_CHAR (0, chr);
1131       res = scm_i_string_start_writing (res);
1132       for (p = 0; p < len; p++)
1133         scm_i_string_set_x (res, p, SCM_CHAR (chr));
1134       scm_i_string_stop_writing ();
1135     }
1136
1137   return res;
1138 }
1139 #undef FUNC_NAME
1140
1141 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1142             (SCM string),
1143             "Return the number of characters in @var{string}.")
1144 #define FUNC_NAME s_scm_string_length
1145 {
1146   SCM_VALIDATE_STRING (1, string);
1147   return scm_from_size_t (STRING_LENGTH (string));
1148 }
1149 #undef FUNC_NAME
1150
1151 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1152             (SCM string),
1153             "Return the bytes used to represent a character in @var{string}."
1154             "This will return 1 or 4.")
1155 #define FUNC_NAME s_scm_string_bytes_per_char
1156 {
1157   SCM_VALIDATE_STRING (1, string);
1158   if (!scm_i_is_narrow_string (string))
1159     return scm_from_int (4);
1160
1161   return scm_from_int (1);
1162 }
1163 #undef FUNC_NAME
1164
1165 size_t
1166 scm_c_string_length (SCM string)
1167 {
1168   if (!IS_STRING (string))
1169     scm_wrong_type_arg_msg (NULL, 0, string, "string");
1170   return STRING_LENGTH (string);
1171 }
1172
1173 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1174             (SCM str, SCM k),
1175             "Return character @var{k} of @var{str} using zero-origin\n"
1176             "indexing. @var{k} must be a valid index of @var{str}.")
1177 #define FUNC_NAME s_scm_string_ref
1178 {
1179   size_t len;
1180   unsigned long idx;
1181
1182   SCM_VALIDATE_STRING (1, str);
1183
1184   len = scm_i_string_length (str);
1185   if (SCM_LIKELY (len > 0))
1186     idx = scm_to_unsigned_integer (k, 0, len - 1);
1187   else
1188     scm_out_of_range (NULL, k);
1189
1190   if (scm_i_is_narrow_string (str))
1191     return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1192   else
1193     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1194 }
1195 #undef FUNC_NAME
1196
1197 SCM
1198 scm_c_string_ref (SCM str, size_t p)
1199 {
1200   if (p >= scm_i_string_length (str))
1201     scm_out_of_range (NULL, scm_from_size_t (p));
1202   if (scm_i_is_narrow_string (str))
1203     return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1204   else
1205     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1206
1207 }
1208
1209 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1210             (SCM str, SCM k, SCM chr),
1211             "Store @var{chr} in element @var{k} of @var{str} and return\n"
1212             "an unspecified value. @var{k} must be a valid index of\n"
1213             "@var{str}.")
1214 #define FUNC_NAME s_scm_string_set_x
1215 {
1216   size_t len;
1217   unsigned long idx;
1218
1219   SCM_VALIDATE_STRING (1, str);
1220
1221   len = scm_i_string_length (str);
1222   if (SCM_LIKELY (len > 0))
1223     idx = scm_to_unsigned_integer (k, 0, len - 1);
1224   else
1225     scm_out_of_range (NULL, k);
1226
1227   SCM_VALIDATE_CHAR (3, chr);
1228   str = scm_i_string_start_writing (str);
1229   scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1230   scm_i_string_stop_writing ();
1231
1232   return SCM_UNSPECIFIED;
1233 }
1234 #undef FUNC_NAME
1235
1236 void
1237 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1238 {
1239   if (p >= scm_i_string_length (str))
1240     scm_out_of_range (NULL, scm_from_size_t (p));
1241   str = scm_i_string_start_writing (str);
1242   scm_i_string_set_x (str, p, SCM_CHAR (chr));
1243   scm_i_string_stop_writing ();
1244 }
1245
1246 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1247             (SCM str, SCM start, SCM end),
1248             "Return a newly allocated string formed from the characters\n"
1249             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1250             "ending with index @var{end} (exclusive).\n"
1251             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1252             "exact integers satisfying:\n\n"
1253             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1254 #define FUNC_NAME s_scm_substring
1255 {
1256   size_t len, from, to;
1257
1258   SCM_VALIDATE_STRING (1, str);
1259   len = scm_i_string_length (str);
1260   from = scm_to_unsigned_integer (start, 0, len);
1261   if (SCM_UNBNDP (end))
1262     to = len;
1263   else
1264     to = scm_to_unsigned_integer (end, from, len);
1265   return scm_i_substring (str, from, to);
1266 }
1267 #undef FUNC_NAME
1268
1269 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1270             (SCM str, SCM start, SCM end),
1271             "Return a newly allocated string formed from the characters\n"
1272             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1273             "ending with index @var{end} (exclusive).\n"
1274             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1275             "exact integers satisfying:\n"
1276             "\n"
1277             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1278             "\n"
1279             "The returned string is read-only.\n")
1280 #define FUNC_NAME s_scm_substring_read_only
1281 {
1282   size_t len, from, to;
1283
1284   SCM_VALIDATE_STRING (1, str);
1285   len = scm_i_string_length (str);
1286   from = scm_to_unsigned_integer (start, 0, len);
1287   if (SCM_UNBNDP (end))
1288     to = len;
1289   else
1290     to = scm_to_unsigned_integer (end, from, len);
1291   return scm_i_substring_read_only (str, from, to);
1292 }
1293 #undef FUNC_NAME
1294
1295 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1296             (SCM str, SCM start, SCM end),
1297             "Return a newly allocated string formed from the characters\n"
1298             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1299             "ending with index @var{end} (exclusive).\n"
1300             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1301             "exact integers satisfying:\n\n"
1302             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1303 #define FUNC_NAME s_scm_substring_copy
1304 {
1305   /* For the Scheme version, START is mandatory, but for the C
1306      version, it is optional.  See scm_string_copy in srfi-13.c for a
1307      rationale.
1308   */
1309
1310   size_t from, to;
1311
1312   SCM_VALIDATE_STRING (1, str);
1313   scm_i_get_substring_spec (scm_i_string_length (str),
1314                             start, &from, end, &to);
1315   return scm_i_substring_copy (str, from, to);
1316 }
1317 #undef FUNC_NAME
1318
1319 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1320             (SCM str, SCM start, SCM end),
1321             "Return string that indirectly refers to the characters\n"
1322             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1323             "ending with index @var{end} (exclusive).\n"
1324             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1325             "exact integers satisfying:\n\n"
1326             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1327 #define FUNC_NAME s_scm_substring_shared
1328 {
1329   size_t len, from, to;
1330
1331   SCM_VALIDATE_STRING (1, str);
1332   len = scm_i_string_length (str);
1333   from = scm_to_unsigned_integer (start, 0, len);
1334   if (SCM_UNBNDP (end))
1335     to = len;
1336   else
1337     to = scm_to_unsigned_integer (end, from, len);
1338   return scm_i_substring_shared (str, from, to);
1339 }
1340 #undef FUNC_NAME
1341
1342 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1343             (SCM args),
1344             "Return a newly allocated string whose characters form the\n"
1345             "concatenation of the given strings, @var{args}.")
1346 #define FUNC_NAME s_scm_string_append
1347 {
1348   SCM res;
1349   size_t len = 0;
1350   int wide = 0;
1351   SCM l, s;
1352   size_t i;
1353   union
1354   {
1355     char *narrow;
1356     scm_t_wchar *wide;
1357   } data;
1358
1359   SCM_VALIDATE_REST_ARGUMENT (args);
1360   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1361     {
1362       s = SCM_CAR (l);
1363       SCM_VALIDATE_STRING (SCM_ARGn, s);
1364       len += scm_i_string_length (s);
1365       if (!scm_i_is_narrow_string (s))
1366         wide = 1;
1367     }
1368   data.narrow = NULL;
1369   if (!wide)
1370     res = scm_i_make_string (len, &data.narrow);
1371   else
1372     res = scm_i_make_wide_string (len, &data.wide);
1373
1374   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1375     {
1376       size_t len;
1377       s = SCM_CAR (l);
1378       SCM_VALIDATE_STRING (SCM_ARGn, s);
1379       len = scm_i_string_length (s);
1380       if (!wide)
1381         {
1382           memcpy (data.narrow, scm_i_string_chars (s), len);
1383           data.narrow += len;
1384         }
1385       else
1386         {
1387           if (scm_i_is_narrow_string (s))
1388             {
1389               for (i = 0; i < scm_i_string_length (s); i++)
1390                 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
1391             }
1392           else
1393             u32_cpy ((scm_t_uint32 *) data.wide,
1394                      (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1395           data.wide += len;
1396         }
1397       scm_remember_upto_here_1 (s);
1398     }
1399   return res;
1400 }
1401 #undef FUNC_NAME
1402
1403 int
1404 scm_is_string (SCM obj)
1405 {
1406   return IS_STRING (obj);
1407 }
1408
1409 \f
1410 /* Conversion to/from other encodings.  */
1411
1412 SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1413 void
1414 scm_encoding_error (const char *subr, int err, const char *message,
1415                     const char *from, const char *to, SCM string_or_bv)
1416 {
1417   /* Raise an exception that conveys all the information needed to debug the
1418      problem.  Only perform locale conversions that are safe; in particular,
1419      don't try to display STRING_OR_BV when it's a string since converting it to
1420      the output locale may fail.  */
1421   scm_throw (scm_encoding_error_key,
1422              scm_list_n (scm_from_locale_string (subr),
1423                          scm_from_locale_string (message),
1424                          scm_from_int (err),
1425                          scm_from_locale_string (from),
1426                          scm_from_locale_string (to),
1427                          string_or_bv,
1428                          SCM_UNDEFINED));
1429 }
1430
1431 SCM
1432 scm_from_stringn (const char *str, size_t len, const char *encoding,
1433                   scm_t_string_failed_conversion_handler handler)
1434 {
1435   size_t u32len, i;
1436   scm_t_wchar *u32;
1437   int wide = 0;
1438   SCM res;
1439
1440   if (len == 0)
1441     return scm_nullstr;
1442
1443   if (encoding == NULL)
1444     {
1445       /* If encoding is null, use Latin-1.  */
1446       char *buf;
1447       res = scm_i_make_string (len, &buf);
1448       memcpy (buf, str, len);
1449       return res;
1450     }
1451
1452   u32len = 0;
1453   u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1454                                                 (enum iconv_ilseq_handler)
1455                                                 handler,
1456                                                 str, len,
1457                                                 NULL,
1458                                                 NULL, &u32len);
1459
1460   if (SCM_UNLIKELY (u32 == NULL))
1461     {
1462       /* Raise an error and pass the raw C string as a bytevector to the `throw'
1463          handler.  */
1464       SCM bv;
1465       signed char *buf;
1466
1467       buf = scm_gc_malloc_pointerless (len, "bytevector");
1468       memcpy (buf, str, len);
1469       bv = scm_c_take_bytevector (buf, len);
1470
1471       scm_encoding_error (__func__, errno,
1472                           "input locale conversion error",
1473                           encoding, "UTF-32", bv);
1474     }
1475
1476   i = 0;
1477   while (i < u32len)
1478     if (u32[i++] > 0xFF)
1479       {
1480         wide = 1;
1481         break;
1482       }
1483
1484   if (!wide)
1485     {
1486       char *dst;
1487       res = scm_i_make_string (u32len, &dst);
1488       for (i = 0; i < u32len; i ++)
1489         dst[i] = (unsigned char) u32[i];
1490       dst[u32len] = '\0';
1491     }
1492   else
1493     {
1494       scm_t_wchar *wdst;
1495       res = scm_i_make_wide_string (u32len, &wdst);
1496       u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1497       wdst[u32len] = 0;
1498     }
1499
1500   free (u32);
1501   return res;
1502 }
1503
1504 SCM
1505 scm_from_latin1_stringn (const char *str, size_t len)
1506 {
1507   return scm_from_stringn (str, len, NULL, SCM_FAILED_CONVERSION_ERROR);
1508 }
1509
1510 SCM
1511 scm_from_locale_stringn (const char *str, size_t len)
1512 {
1513   const char *enc;
1514   scm_t_string_failed_conversion_handler hndl;
1515   SCM inport;
1516   scm_t_port *pt;
1517
1518   if (len == (size_t) -1)
1519     len = strlen (str);
1520   if (len == 0)
1521     return scm_nullstr;
1522
1523   inport = scm_current_input_port ();
1524   if (!SCM_UNBNDP (inport) && SCM_OPINPORTP (inport))
1525     {
1526       pt = SCM_PTAB_ENTRY (inport);
1527       enc = pt->encoding;
1528       hndl = pt->ilseq_handler;
1529     }
1530   else
1531     {
1532       enc = NULL;
1533       hndl = SCM_FAILED_CONVERSION_ERROR;
1534     }
1535
1536   return scm_from_stringn (str, len, enc, hndl);
1537 }
1538
1539 SCM
1540 scm_from_locale_string (const char *str)
1541 {
1542   if (str == NULL)
1543     return scm_nullstr;
1544
1545   return scm_from_locale_stringn (str, -1);
1546 }
1547
1548 SCM
1549 scm_i_from_utf8_string (const scm_t_uint8 *str)
1550 {
1551   return scm_from_stringn ((const char *) str,
1552                            strlen ((char *) str), "UTF-8",
1553                            SCM_FAILED_CONVERSION_ERROR);
1554 }
1555
1556 /* Create a new scheme string from the C string STR.  The memory of
1557    STR may be used directly as storage for the new string.  */
1558 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1559    would be to register a finalizer to eventually free(3) STR, which isn't
1560    worth it.  Should we just deprecate the `scm_take_' functions?  */
1561 SCM
1562 scm_take_locale_stringn (char *str, size_t len)
1563 {
1564   SCM res;
1565
1566   res = scm_from_locale_stringn (str, len);
1567   free (str);
1568
1569   return res;
1570 }
1571
1572 SCM
1573 scm_take_locale_string (char *str)
1574 {
1575   return scm_take_locale_stringn (str, -1);
1576 }
1577
1578 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1579    *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1580    Set *LENP to the size of the resulting string.  */
1581 void
1582 scm_i_unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
1583 {
1584   char *before, *after;
1585   size_t i, j;
1586
1587   before = buf;
1588   after = buf;
1589   i = 0;
1590   j = 0;
1591   while (i < *lenp)
1592     {
1593       if ((i <= *lenp - 6)
1594           && before[i] == '\\'
1595           && before[i + 1] == 'u'
1596           && before[i + 2] == '0' && before[i + 3] == '0')
1597         {
1598           /* Convert \u00NN to \xNN */
1599           after[j] = '\\';
1600           after[j + 1] = 'x';
1601           after[j + 2] = tolower ((int) before[i + 4]);
1602           after[j + 3] = tolower ((int) before[i + 5]);
1603           i += 6;
1604           j += 4;
1605         }
1606       else if ((i <= *lenp - 10)
1607                && before[i] == '\\'
1608                && before[i + 1] == 'U'
1609                && before[i + 2] == '0' && before[i + 3] == '0')
1610         {
1611           /* Convert \U00NNNNNN to \UNNNNNN */
1612           after[j] = '\\';
1613           after[j + 1] = 'U';
1614           after[j + 2] = tolower ((int) before[i + 4]);
1615           after[j + 3] = tolower ((int) before[i + 5]);
1616           after[j + 4] = tolower ((int) before[i + 6]);
1617           after[j + 5] = tolower ((int) before[i + 7]);
1618           after[j + 6] = tolower ((int) before[i + 8]);
1619           after[j + 7] = tolower ((int) before[i + 9]);
1620           i += 10;
1621           j += 8;
1622         }
1623       else
1624         {
1625           after[j] = before[i];
1626           i++;
1627           j++;
1628         }
1629     }
1630   *lenp = j;
1631 }
1632
1633 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1634    *LENP-byte locale-encoded string, to `\xXXXX;'.  Set *LEN to the size
1635    of the resulting string.  BUF must be large enough to handle the
1636    worst case when `\uXXXX' escapes (6 characters) are replaced by
1637    `\xXXXX;' (7 characters).  */
1638 void
1639 scm_i_unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
1640 {
1641   char *before, *after;
1642   size_t i, j;
1643   /* The worst case is if the input string contains all 4-digit hex escapes.
1644      "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1645   size_t max_out_len = (*lenp * 7) / 6 + 1;
1646   size_t nzeros, ndigits;
1647
1648   before = buf;
1649   after = alloca (max_out_len);
1650   i = 0;
1651   j = 0;
1652   while (i < *lenp)
1653     {
1654       if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1655           || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1656         {
1657           if (before[i + 1] == 'u')
1658             ndigits = 4;
1659           else if (before[i + 1] == 'U')
1660             ndigits = 8;
1661           else
1662             abort ();
1663
1664           /* Add the R6RS hex escape initial sequence.  */
1665           after[j] = '\\';
1666           after[j + 1] = 'x';
1667
1668           /* Move string positions to the start of the hex numbers.  */
1669           i += 2;
1670           j += 2;
1671
1672           /* Find the number of initial zeros in this hex number.  */
1673           nzeros = 0;
1674           while (before[i + nzeros] == '0' && nzeros < ndigits)
1675             nzeros++;
1676
1677           /* Copy the number, skipping initial zeros, and then move the string
1678              positions.  */
1679           if (nzeros == ndigits)
1680             {
1681               after[j] = '0';
1682               i += ndigits;
1683               j += 1;
1684             }
1685           else
1686             {
1687               int pos;
1688               for (pos = 0; pos < ndigits - nzeros; pos++)
1689                 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1690               i += ndigits;
1691               j += (ndigits - nzeros);
1692             }
1693
1694           /* Add terminating semicolon.  */
1695           after[j] = ';';
1696           j++;
1697         }
1698       else
1699         {
1700           after[j] = before[i];
1701           i++;
1702           j++;
1703         }
1704     }
1705   *lenp = j;
1706   memcpy (before, after, j);
1707 }
1708
1709 char *
1710 scm_to_latin1_stringn (SCM str, size_t *lenp)
1711 {
1712   return scm_to_stringn (str, lenp, NULL, SCM_FAILED_CONVERSION_ERROR);
1713 }
1714
1715 char *
1716 scm_to_locale_stringn (SCM str, size_t *lenp)
1717 {
1718   SCM outport;
1719   scm_t_port *pt;
1720   const char *enc;
1721
1722   outport = scm_current_output_port ();
1723   if (!SCM_UNBNDP (outport) && SCM_OPOUTPORTP (outport))
1724     {
1725       pt = SCM_PTAB_ENTRY (outport);
1726       enc = pt->encoding;
1727     }
1728   else
1729     enc = NULL;
1730
1731   return scm_to_stringn (str, lenp,
1732                          enc,
1733                          scm_i_get_conversion_strategy (SCM_BOOL_F));
1734 }
1735
1736 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
1737    according to ENCODING.  If LENP is non-NULL, set it to the size in bytes of
1738    the returned buffer.  If the conversion to ENCODING fails, apply the strategy
1739    defined by HANDLER.  */
1740 char *
1741 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
1742                 scm_t_string_failed_conversion_handler handler)
1743 {
1744   char *buf;
1745   size_t ilen, len, i;
1746   int ret;
1747   const char *enc;
1748
1749   if (!scm_is_string (str))
1750     scm_wrong_type_arg_msg (NULL, 0, str, "string");
1751   ilen = scm_i_string_length (str);
1752
1753   if (ilen == 0)
1754     {
1755       buf = scm_malloc (1);
1756       buf[0] = '\0';
1757       if (lenp)
1758         *lenp = 0;
1759       return buf;
1760     }
1761
1762   if (lenp == NULL)
1763     for (i = 0; i < ilen; i++)
1764       if (scm_i_string_ref (str, i) == '\0')
1765         scm_misc_error (NULL,
1766                         "string contains #\\nul character: ~S",
1767                         scm_list_1 (str));
1768
1769   if (scm_i_is_narrow_string (str) && (encoding == NULL))
1770     {
1771       /* If using native Latin-1 encoding, just copy the string
1772          contents.  */
1773       if (lenp)
1774         {
1775           buf = scm_malloc (ilen);
1776           memcpy (buf, scm_i_string_chars (str), ilen);
1777           *lenp = ilen;
1778           return buf;
1779         }
1780       else
1781         {
1782           buf = scm_malloc (ilen + 1);
1783           memcpy (buf, scm_i_string_chars (str), ilen);
1784           buf[ilen] = '\0';
1785           return buf;
1786         }
1787     }
1788
1789
1790   buf = NULL;
1791   len = 0;
1792   enc = encoding;
1793   if (enc == NULL)
1794     enc = "ISO-8859-1";
1795   if (scm_i_is_narrow_string (str))
1796     {
1797       ret = mem_iconveh (scm_i_string_chars (str), ilen,
1798                          "ISO-8859-1", enc,
1799                          (enum iconv_ilseq_handler) handler, NULL,
1800                          &buf, &len);
1801
1802       if (ret != 0)
1803         scm_encoding_error (__func__, errno,
1804                             "cannot convert to output locale",
1805                             "ISO-8859-1", enc, str);
1806     }
1807   else
1808     {
1809       buf = u32_conv_to_encoding (enc,
1810                                   (enum iconv_ilseq_handler) handler,
1811                                   (scm_t_uint32 *) scm_i_string_wide_chars (str),
1812                                   ilen,
1813                                   NULL,
1814                                   NULL, &len);
1815       if (buf == NULL)
1816         scm_encoding_error (__func__, errno,
1817                             "cannot convert to output locale",
1818                             "UTF-32", enc, str);
1819     }
1820   if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
1821     {
1822       if (SCM_R6RS_ESCAPES_P)
1823         {
1824           /* The worst case is if the input string contains all 4-digit
1825              hex escapes.  "\uXXXX" (six characters) becomes "\xXXXX;"
1826              (seven characters).  Make BUF large enough to hold
1827              that.  */
1828           buf = scm_realloc (buf, (len * 7) / 6 + 1);
1829           scm_i_unistring_escapes_to_r6rs_escapes (buf, &len);
1830         }
1831       else
1832         scm_i_unistring_escapes_to_guile_escapes (buf, &len);
1833
1834       buf = scm_realloc (buf, len);
1835     }
1836   if (lenp)
1837     *lenp = len;
1838   else
1839     {
1840       buf = scm_realloc (buf, len + 1);
1841       buf[len] = '\0';
1842     }
1843
1844   scm_remember_upto_here_1 (str);
1845   return buf;
1846 }
1847
1848 char *
1849 scm_to_locale_string (SCM str)
1850 {
1851   return scm_to_locale_stringn (str, NULL);
1852 }
1853
1854 scm_t_uint8 *
1855 scm_i_to_utf8_string (SCM str)
1856 {
1857   char *u8str;
1858   u8str = scm_to_stringn (str, NULL, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1859   return (scm_t_uint8 *) u8str;
1860 }
1861
1862 size_t
1863 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
1864 {
1865   size_t len;
1866   char *result = NULL;
1867   if (!scm_is_string (str))
1868     scm_wrong_type_arg_msg (NULL, 0, str, "string");
1869   result = scm_to_locale_stringn (str, &len);
1870
1871   memcpy (buf, result, (len > max_len) ? max_len : len);
1872   free (result);
1873
1874   scm_remember_upto_here_1 (str);
1875   return len;
1876 }
1877
1878 \f
1879 /* Unicode string normalization.  */
1880
1881 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
1882    libguile/i18n.c.  It would be useful to have this factored out into a more
1883    convenient location, but its use of alloca makes that tricky to do. */
1884
1885 static SCM
1886 normalize_str (SCM string, uninorm_t form)
1887 {
1888   SCM ret;
1889   scm_t_uint32 *w_str;
1890   scm_t_wchar *cbuf;
1891   size_t rlen, len = scm_i_string_length (string);
1892
1893   if (scm_i_is_narrow_string (string))
1894     {
1895       size_t i;
1896       const char *buf = scm_i_string_chars (string);
1897
1898       w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
1899
1900       for (i = 0; i < len; i ++)
1901         w_str[i] = (unsigned char) buf[i];
1902       w_str[len] = 0;
1903     }
1904   else
1905     w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
1906
1907   w_str = u32_normalize (form, w_str, len, NULL, &rlen);
1908
1909   ret = scm_i_make_wide_string (rlen, &cbuf);
1910   u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
1911   free (w_str);
1912
1913   scm_i_try_narrow_string (ret);
1914
1915   return ret;
1916 }
1917
1918 SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
1919             (SCM string),
1920             "Returns the NFC normalized form of @var{string}.")
1921 #define FUNC_NAME s_scm_string_normalize_nfc
1922 {
1923   SCM_VALIDATE_STRING (1, string);
1924   return normalize_str (string, UNINORM_NFC);
1925 }
1926 #undef FUNC_NAME
1927
1928 SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
1929             (SCM string),
1930             "Returns the NFD normalized form of @var{string}.")
1931 #define FUNC_NAME s_scm_string_normalize_nfd
1932 {
1933   SCM_VALIDATE_STRING (1, string);
1934   return normalize_str (string, UNINORM_NFD);
1935 }
1936 #undef FUNC_NAME
1937
1938 SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
1939             (SCM string),
1940             "Returns the NFKC normalized form of @var{string}.")
1941 #define FUNC_NAME s_scm_string_normalize_nfkc
1942 {
1943   SCM_VALIDATE_STRING (1, string);
1944   return normalize_str (string, UNINORM_NFKC);
1945 }
1946 #undef FUNC_NAME
1947
1948 SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
1949             (SCM string),
1950             "Returns the NFKD normalized form of @var{string}.")
1951 #define FUNC_NAME s_scm_string_normalize_nfkd
1952 {
1953   SCM_VALIDATE_STRING (1, string);
1954   return normalize_str (string, UNINORM_NFKD);
1955 }
1956 #undef FUNC_NAME
1957
1958 /* converts C scm_array of strings to SCM scm_list of strings. */
1959 /* If argc < 0, a null terminated scm_array is assumed. */
1960 SCM
1961 scm_makfromstrs (int argc, char **argv)
1962 {
1963   int i = argc;
1964   SCM lst = SCM_EOL;
1965   if (0 > i)
1966     for (i = 0; argv[i]; i++);
1967   while (i--)
1968     lst = scm_cons (scm_from_locale_string (argv[i]), lst);
1969   return lst;
1970 }
1971
1972 /* Return a newly allocated array of char pointers to each of the strings
1973    in args, with a terminating NULL pointer.  */
1974
1975 char **
1976 scm_i_allocate_string_pointers (SCM list)
1977 #define FUNC_NAME "scm_i_allocate_string_pointers"
1978 {
1979   char **result;
1980   int len = scm_ilength (list);
1981   int i;
1982
1983   if (len < 0)
1984     scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
1985
1986   result = scm_gc_malloc ((len + 1) * sizeof (char *),
1987                           "string pointers");
1988   result[len] = NULL;
1989
1990   /* The list might be have been modified in another thread, so
1991      we check LIST before each access.
1992    */
1993   for (i = 0; i < len && scm_is_pair (list); i++)
1994     {
1995       SCM str;
1996       size_t len;
1997
1998       str = SCM_CAR (list);
1999       len = scm_c_string_length (str);
2000
2001       result[i] = scm_gc_malloc_pointerless (len + 1, "string pointers");
2002       memcpy (result[i], scm_i_string_chars (str), len);
2003       result[i][len] = '\0';
2004
2005       list = SCM_CDR (list);
2006     }
2007
2008   return result;
2009 }
2010 #undef FUNC_NAME
2011
2012 void
2013 scm_i_get_substring_spec (size_t len,
2014                           SCM start, size_t *cstart,
2015                           SCM end, size_t *cend)
2016 {
2017   if (SCM_UNBNDP (start))
2018     *cstart = 0;
2019   else
2020     *cstart = scm_to_unsigned_integer (start, 0, len);
2021
2022   if (SCM_UNBNDP (end))
2023     *cend = len;
2024   else
2025     *cend = scm_to_unsigned_integer (end, *cstart, len);
2026 }
2027
2028 #if SCM_ENABLE_DEPRECATED
2029
2030 /* When these definitions are removed, it becomes reasonable to use
2031    read-only strings for string literals.  For that, change the reader
2032    to create string literals with scm_c_substring_read_only instead of
2033    with scm_c_substring_copy.
2034 */
2035
2036 int
2037 scm_i_deprecated_stringp (SCM str)
2038 {
2039   scm_c_issue_deprecation_warning
2040     ("SCM_STRINGP is deprecated.  Use scm_is_string instead.");
2041
2042   return scm_is_string (str);
2043 }
2044
2045 char *
2046 scm_i_deprecated_string_chars (SCM str)
2047 {
2048   char *chars;
2049
2050   scm_c_issue_deprecation_warning
2051     ("SCM_STRING_CHARS is deprecated.  See the manual for alternatives.");
2052
2053   /* We don't accept shared substrings here since they are not
2054      null-terminated.
2055   */
2056   if (IS_SH_STRING (str))
2057     scm_misc_error (NULL,
2058                     "SCM_STRING_CHARS does not work with shared substrings",
2059                     SCM_EOL);
2060
2061   /* We explicitly test for read-only strings to produce a better
2062      error message.
2063   */
2064
2065   if (IS_RO_STRING (str))
2066     scm_misc_error (NULL,
2067                     "SCM_STRING_CHARS does not work with read-only strings",
2068                     SCM_EOL);
2069
2070   /* The following is still wrong, of course...
2071    */
2072   str = scm_i_string_start_writing (str);
2073   chars = scm_i_string_writable_chars (str);
2074   scm_i_string_stop_writing ();
2075   return chars;
2076 }
2077
2078 size_t
2079 scm_i_deprecated_string_length (SCM str)
2080 {
2081   scm_c_issue_deprecation_warning
2082     ("SCM_STRING_LENGTH is deprecated.  Use scm_c_string_length instead.");
2083   return scm_c_string_length (str);
2084 }
2085
2086 #endif
2087
2088 static SCM
2089 string_handle_ref (scm_t_array_handle *h, size_t index)
2090 {
2091   return scm_c_string_ref (h->array, index);
2092 }
2093
2094 static void
2095 string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2096 {
2097   scm_c_string_set_x (h->array, index, val);
2098 }
2099
2100 static void
2101 string_get_handle (SCM v, scm_t_array_handle *h)
2102 {
2103   h->array = v;
2104   h->ndims = 1;
2105   h->dims = &h->dim0;
2106   h->dim0.lbnd = 0;
2107   h->dim0.ubnd = scm_c_string_length (v) - 1;
2108   h->dim0.inc = 1;
2109   h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2110   h->elements = h->writable_elements = NULL;
2111 }
2112
2113 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2114                           string_handle_ref, string_handle_set,
2115                           string_get_handle)
2116 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2117
2118 void
2119 scm_init_strings ()
2120 {
2121   scm_nullstr = scm_i_make_string (0, NULL);
2122
2123 #include "libguile/strings.x"
2124 }
2125
2126
2127 /*
2128   Local Variables:
2129   c-file-style: "gnu"
2130   End:
2131 */