libguile/strings.c

   1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010 Free Software Foundation, Inc.
   2  *
   3  * This library is free software; you can redistribute it and/or
   4  * modify it under the terms of the GNU Lesser General Public License
   5  * as published by the Free Software Foundation; either version 3 of
   6  * the License, or (at your option) any later version.
   7  *
   8  * This library is distributed in the hope that it will be useful, but
   9  * WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * Lesser General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU Lesser General Public
  14  * License along with this library; if not, write to the Free Software
  15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  16  * 02110-1301 USA
  17  */
  18
  19
  20 \f
  21 #ifdef HAVE_CONFIG_H
  22 # include <config.h>
  23 #endif
  24
  25 #include <string.h>
  26 #include <stdio.h>
  27 #include <ctype.h>
  28 #include <uninorm.h>
  29 #include <unistr.h>
  30 #include <uniconv.h>
  31
  32 #include "striconveh.h"
  33
  34 #include "libguile/_scm.h"
  35 #include "libguile/chars.h"
  36 #include "libguile/root.h"
  37 #include "libguile/strings.h"
  38 #include "libguile/error.h"
  39 #include "libguile/generalized-vectors.h"
  40 #include "libguile/deprecation.h"
  41 #include "libguile/validate.h"
  42 #include "libguile/private-options.h"
  43
  44 \f
  45
  46 /* {Strings}
  47  */
  48
  49
  50 /* Stringbufs
  51  *
  52  * XXX - keeping an accurate refcount during GC seems to be quite
  53  * tricky, so we just keep score of whether a stringbuf might be
  54  * shared, not whether it definitely is.
  55  *
  56  * The scheme I (mvo) tried to keep an accurate reference count would
  57  * recount all strings that point to a stringbuf during the mark-phase
  58  * of the GC.  This was done since one cannot access the stringbuf of
  59  * a string when that string is freed (in order to decrease the
  60  * reference count).  The memory of the stringbuf might have been
  61  * reused already for something completely different.
  62  *
  63  * This recounted worked for a small number of threads beating on
  64  * cow-strings, but it failed randomly with more than 10 threads, say.
  65  * I couldn't figure out what went wrong, so I used the conservative
  66  * approach implemented below.
  67  *
  68  * There are 2 storage strategies for stringbufs: 8-bit and wide.  8-bit
  69  * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
  70  * strings.
  71  */
  72
  73 /* The size in words of the stringbuf header (type tag + size).  */
  74 #define STRINGBUF_HEADER_SIZE   2U
  75
  76 #define STRINGBUF_HEADER_BYTES  (STRINGBUF_HEADER_SIZE * sizeof (SCM))
  77
  78 #define STRINGBUF_F_SHARED      SCM_I_STRINGBUF_F_SHARED
  79 #define STRINGBUF_F_WIDE        SCM_I_STRINGBUF_F_WIDE
  80
  81 #define STRINGBUF_TAG           scm_tc7_stringbuf
  82 #define STRINGBUF_SHARED(buf)   (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
  83 #define STRINGBUF_WIDE(buf)     (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
  84
  85 #define STRINGBUF_CHARS(buf)    ((unsigned char *)                      \
  86                                  SCM_CELL_OBJECT_LOC (buf,              \
  87                                                       STRINGBUF_HEADER_SIZE))
  88 #define STRINGBUF_LENGTH(buf)   (SCM_CELL_WORD_1 (buf))
  89
  90 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CHARS (buf))
  91
  92 #define SET_STRINGBUF_SHARED(buf)                                       \
  93   do                                                                    \
  94     {                                                                   \
  95       /* Don't modify BUF if it's already marked as shared since it might be \
  96          a read-only, statically allocated stringbuf.  */               \
  97       if (SCM_LIKELY (!STRINGBUF_SHARED (buf)))                         \
  98         SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
  99     }                                                                   \
 100   while (0)
 101
 102 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 103 static size_t lenhist[1001];
 104 #endif
 105
 106 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
 107    characters. */
 108 static SCM
 109 make_stringbuf (size_t len)
 110 {
 111   /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
 112      scm_i_symbol_chars, all stringbufs are null-terminated.  Once
 113      SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
 114      has been changed for scm_i_symbol_chars, this null-termination
 115      can be dropped.
 116   */
 117
 118   SCM buf;
 119
 120 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 121   if (len < 1000)
 122     lenhist[len]++;
 123   else
 124     lenhist[1000]++;
 125 #endif
 126
 127   buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
 128                                             "string"));
 129
 130   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
 131   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 132
 133   STRINGBUF_CHARS (buf)[len] = 0;
 134
 135   return buf;
 136 }
 137
 138 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
 139    characters.  */
 140 static SCM
 141 make_wide_stringbuf (size_t len)
 142 {
 143   SCM buf;
 144   size_t raw_len;
 145
 146 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 147   if (len < 1000)
 148     lenhist[len]++;
 149   else
 150     lenhist[1000]++;
 151 #endif
 152
 153   raw_len = (len + 1) * sizeof (scm_t_wchar);
 154   buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
 155                                             "string"));
 156
 157   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
 158   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 159
 160   STRINGBUF_WIDE_CHARS (buf)[len] = 0;
 161
 162   return buf;
 163 }
 164
 165 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
 166    characters from BUF.  */
 167 static SCM
 168 wide_stringbuf (SCM buf)
 169 {
 170   SCM new_buf;
 171
 172   if (STRINGBUF_WIDE (buf))
 173     new_buf = buf;
 174   else
 175     {
 176       size_t i, len;
 177       scm_t_wchar *mem;
 178
 179       len = STRINGBUF_LENGTH (buf);
 180
 181       new_buf = make_wide_stringbuf (len);
 182
 183       mem = STRINGBUF_WIDE_CHARS (new_buf);
 184       for (i = 0; i < len; i++)
 185         mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
 186       mem[len] = 0;
 187     }
 188
 189   return new_buf;
 190 }
 191
 192 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
 193    characters from BUF, if possible.  */
 194 static SCM
 195 narrow_stringbuf (SCM buf)
 196 {
 197   SCM new_buf;
 198
 199   if (!STRINGBUF_WIDE (buf))
 200     new_buf = buf;
 201   else
 202     {
 203       size_t i, len;
 204       scm_t_wchar *wmem;
 205       unsigned char *mem;
 206
 207       len = STRINGBUF_LENGTH (buf);
 208       wmem = STRINGBUF_WIDE_CHARS (buf);
 209
 210       for (i = 0; i < len; i++)
 211         if (wmem[i] > 0xFF)
 212           /* BUF cannot be narrowed.  */
 213           return buf;
 214
 215       new_buf = make_stringbuf (len);
 216
 217       mem = STRINGBUF_CHARS (new_buf);
 218       for (i = 0; i < len; i++)
 219         mem[i] = (unsigned char) wmem[i];
 220       mem[len] = 0;
 221     }
 222
 223   return new_buf;
 224 }
 225
 226 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
 227
 228 \f
 229 /* Copy-on-write strings.
 230  */
 231
 232 #define STRING_TAG            scm_tc7_string
 233
 234 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
 235 #define STRING_START(str)     ((size_t)SCM_CELL_WORD_2(str))
 236 #define STRING_LENGTH(str)    ((size_t)SCM_CELL_WORD_3(str))
 237
 238 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
 239 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
 240
 241 #define IS_STRING(str)        (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
 242
 243 /* Read-only strings.
 244  */
 245
 246 #define RO_STRING_TAG         scm_tc7_ro_string
 247 #define IS_RO_STRING(str)     (SCM_CELL_TYPE(str)==RO_STRING_TAG)
 248
 249 /* Mutation-sharing substrings
 250  */
 251
 252 #define SH_STRING_TAG       (scm_tc7_string + 0x100)
 253
 254 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
 255 /* START and LENGTH as for STRINGs. */
 256
 257 #define IS_SH_STRING(str)   (SCM_CELL_TYPE(str)==SH_STRING_TAG)
 258
 259 SCM scm_nullstr;
 260
 261 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
 262    characters.  CHARSP, if not NULL, will be set to location of the
 263    char array.  */
 264 SCM
 265 scm_i_make_string (size_t len, char **charsp)
 266 {
 267   SCM buf = make_stringbuf (len);
 268   SCM res;
 269   if (charsp)
 270     *charsp = (char *) STRINGBUF_CHARS (buf);
 271   res = scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
 272                          (scm_t_bits)0, (scm_t_bits) len);
 273   return res;
 274 }
 275
 276 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
 277    characters.  CHARSP, if not NULL, will be set to location of the
 278    character array.  */
 279 SCM
 280 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp)
 281 {
 282   SCM buf = make_wide_stringbuf (len);
 283   SCM res;
 284   if (charsp)
 285     *charsp = STRINGBUF_WIDE_CHARS (buf);
 286   res = scm_double_cell (STRING_TAG, SCM_UNPACK (buf),
 287                          (scm_t_bits) 0, (scm_t_bits) len);
 288   return res;
 289 }
 290
 291 static void
 292 validate_substring_args (SCM str, size_t start, size_t end)
 293 {
 294   if (!IS_STRING (str))
 295     scm_wrong_type_arg_msg (NULL, 0, str, "string");
 296   if (start > STRING_LENGTH (str))
 297     scm_out_of_range (NULL, scm_from_size_t (start));
 298   if (end > STRING_LENGTH (str) || end < start)
 299     scm_out_of_range (NULL, scm_from_size_t (end));
 300 }
 301
 302 static inline void
 303 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
 304 {
 305   *start = STRING_START (*str);
 306   if (IS_SH_STRING (*str))
 307     {
 308       *str = SH_STRING_STRING (*str);
 309       *start += STRING_START (*str);
 310     }
 311   *buf = STRING_STRINGBUF (*str);
 312 }
 313
 314 SCM
 315 scm_i_substring (SCM str, size_t start, size_t end)
 316 {
 317   SCM buf;
 318   size_t str_start;
 319   get_str_buf_start (&str, &buf, &str_start);
 320   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 321   SET_STRINGBUF_SHARED (buf);
 322   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 323   return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
 324                           (scm_t_bits)str_start + start,
 325                           (scm_t_bits) end - start);
 326 }
 327
 328 SCM
 329 scm_i_substring_read_only (SCM str, size_t start, size_t end)
 330 {
 331   SCM buf;
 332   size_t str_start;
 333   get_str_buf_start (&str, &buf, &str_start);
 334   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 335   SET_STRINGBUF_SHARED (buf);
 336   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 337   return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
 338                           (scm_t_bits)str_start + start,
 339                           (scm_t_bits) end - start);
 340 }
 341
 342 SCM
 343 scm_i_substring_copy (SCM str, size_t start, size_t end)
 344 {
 345   size_t len = end - start;
 346   SCM buf, my_buf;
 347   size_t str_start;
 348   get_str_buf_start (&str, &buf, &str_start);
 349   if (scm_i_is_narrow_string (str))
 350     {
 351       my_buf = make_stringbuf (len);
 352       memcpy (STRINGBUF_CHARS (my_buf),
 353               STRINGBUF_CHARS (buf) + str_start + start, len);
 354     }
 355   else
 356     {
 357       my_buf = make_wide_stringbuf (len);
 358       u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
 359                (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
 360                                  + start), len);
 361       /* Even though this string is wide, the substring may be narrow.
 362          Consider adding code to narrow the string.  */
 363     }
 364   scm_remember_upto_here_1 (buf);
 365   return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
 366                           (scm_t_bits) 0, (scm_t_bits) len);
 367 }
 368
 369 SCM
 370 scm_i_substring_shared (SCM str, size_t start, size_t end)
 371 {
 372   if (start == 0 && end == STRING_LENGTH (str))
 373     return str;
 374   else
 375     {
 376       size_t len = end - start;
 377       if (IS_SH_STRING (str))
 378         {
 379           start += STRING_START (str);
 380           str = SH_STRING_STRING (str);
 381         }
 382       return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
 383                               (scm_t_bits)start, (scm_t_bits) len);
 384     }
 385 }
 386
 387 SCM
 388 scm_c_substring (SCM str, size_t start, size_t end)
 389 {
 390   validate_substring_args (str, start, end);
 391   return scm_i_substring (str, start, end);
 392 }
 393
 394 SCM
 395 scm_c_substring_read_only (SCM str, size_t start, size_t end)
 396 {
 397   validate_substring_args (str, start, end);
 398   return scm_i_substring_read_only (str, start, end);
 399 }
 400
 401 SCM
 402 scm_c_substring_copy (SCM str, size_t start, size_t end)
 403 {
 404   validate_substring_args (str, start, end);
 405   return scm_i_substring_copy (str, start, end);
 406 }
 407
 408 SCM
 409 scm_c_substring_shared (SCM str, size_t start, size_t end)
 410 {
 411   validate_substring_args (str, start, end);
 412   return scm_i_substring_shared (str, start, end);
 413 }
 414
 415 \f
 416 /* Internal accessors
 417  */
 418
 419 /* Returns the number of characters in STR.  This may be different
 420    than the memory size of the string storage.  */
 421 size_t
 422 scm_i_string_length (SCM str)
 423 {
 424   return STRING_LENGTH (str);
 425 }
 426
 427 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
 428    encoding.  False if it is 'wide', having a 32-bit UCS-4
 429    encoding.  */
 430 int
 431 scm_i_is_narrow_string (SCM str)
 432 {
 433   return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
 434 }
 435
 436 /* Try to coerce a string to be narrow.  It if is narrow already, do
 437    nothing.  If it is wide, shrink it to narrow if none of its
 438    characters are above 0xFF.  Return true if the string is narrow or
 439    was made to be narrow.  */
 440 int
 441 scm_i_try_narrow_string (SCM str)
 442 {
 443   SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
 444
 445   return scm_i_is_narrow_string (str);
 446 }
 447
 448 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
 449    STR.  */
 450 const char *
 451 scm_i_string_chars (SCM str)
 452 {
 453   SCM buf;
 454   size_t start;
 455   get_str_buf_start (&str, &buf, &start);
 456   if (scm_i_is_narrow_string (str))
 457     return (const char *) STRINGBUF_CHARS (buf) + start;
 458   else
 459     scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
 460                     scm_list_1 (str));
 461   return NULL;
 462 }
 463
 464 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
 465    STR.  */
 466 const scm_t_wchar *
 467 scm_i_string_wide_chars (SCM str)
 468 {
 469   SCM buf;
 470   size_t start;
 471
 472   get_str_buf_start (&str, &buf, &start);
 473   if (!scm_i_is_narrow_string (str))
 474     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
 475   else
 476     scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
 477                     scm_list_1 (str));
 478 }
 479
 480 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
 481    a new string buffer, so that it can be modified without modifying
 482    other strings.  Also, lock the string mutex.  Later, one must call
 483    scm_i_string_stop_writing to unlock the mutex.  */
 484 SCM
 485 scm_i_string_start_writing (SCM orig_str)
 486 {
 487   SCM buf, str = orig_str;
 488   size_t start;
 489
 490   get_str_buf_start (&str, &buf, &start);
 491   if (IS_RO_STRING (str))
 492     scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
 493
 494   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 495   if (STRINGBUF_SHARED (buf))
 496     {
 497       /* Clone the stringbuf.  */
 498       size_t len = STRING_LENGTH (str);
 499       SCM new_buf;
 500
 501       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 502
 503       if (scm_i_is_narrow_string (str))
 504         {
 505           new_buf = make_stringbuf (len);
 506           memcpy (STRINGBUF_CHARS (new_buf),
 507                   STRINGBUF_CHARS (buf) + STRING_START (str), len);
 508
 509         }
 510       else
 511         {
 512           new_buf = make_wide_stringbuf (len);
 513           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 514                    (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
 515                                      + STRING_START (str)), len);
 516         }
 517
 518       SET_STRING_STRINGBUF (str, new_buf);
 519       start -= STRING_START (str);
 520
 521       /* FIXME: The following operations are not atomic, so other threads
 522          looking at STR may see an inconsistent state.  Nevertheless it can't
 523          hurt much since (i) accessing STR while it is being mutated can't
 524          yield a crash, and (ii) concurrent accesses to STR should be
 525          protected by a mutex at the application level.  The latter may not
 526          apply when STR != ORIG_STR, though.  */
 527       SET_STRING_START (str, 0);
 528       SET_STRING_STRINGBUF (str, new_buf);
 529
 530       buf = new_buf;
 531
 532       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 533     }
 534   return orig_str;
 535 }
 536
 537 /* Return a pointer to the 8-bit Latin-1 chars of a string.  */
 538 char *
 539 scm_i_string_writable_chars (SCM str)
 540 {
 541   SCM buf;
 542   size_t start;
 543
 544   get_str_buf_start (&str, &buf, &start);
 545   if (scm_i_is_narrow_string (str))
 546     return (char *) STRINGBUF_CHARS (buf) + start;
 547   else
 548     scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
 549                     scm_list_1 (str));
 550   return NULL;
 551 }
 552
 553 /* Return a pointer to the UCS-4 codepoints of a string.  */
 554 static scm_t_wchar *
 555 scm_i_string_writable_wide_chars (SCM str)
 556 {
 557   SCM buf;
 558   size_t start;
 559
 560   get_str_buf_start (&str, &buf, &start);
 561   if (!scm_i_is_narrow_string (str))
 562     return STRINGBUF_WIDE_CHARS (buf) + start;
 563   else
 564     scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
 565                     scm_list_1 (str));
 566 }
 567
 568 /* Unlock the string mutex that was locked when
 569    scm_i_string_start_writing was called.  */
 570 void
 571 scm_i_string_stop_writing (void)
 572 {
 573   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 574 }
 575
 576 /* Return the Xth character of STR as a UCS-4 codepoint.  */
 577 scm_t_wchar
 578 scm_i_string_ref (SCM str, size_t x)
 579 {
 580   if (scm_i_is_narrow_string (str))
 581     return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
 582   else
 583     return scm_i_string_wide_chars (str)[x];
 584 }
 585
 586 /* Returns index+1 of the first char in STR that matches C, or
 587    0 if the char is not found.  */
 588 int
 589 scm_i_string_contains_char (SCM str, char ch)
 590 {
 591   size_t i;
 592   size_t len = scm_i_string_length (str);
 593
 594   i = 0;
 595   if (scm_i_is_narrow_string (str))
 596     {
 597       while (i < len)
 598         {
 599           if (scm_i_string_chars (str)[i] == ch)
 600             return i+1;
 601           i++;
 602         }
 603     }
 604   else
 605     {
 606       while (i < len)
 607         {
 608           if (scm_i_string_wide_chars (str)[i]
 609               == (unsigned char) ch)
 610             return i+1;
 611           i++;
 612         }
 613     }
 614   return 0;
 615 }
 616
 617 int
 618 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
 619 {
 620   if (scm_i_is_narrow_string (sstr))
 621     {
 622       const char *a = scm_i_string_chars (sstr) + start_x;
 623       const char *b = cstr;
 624       return strncmp (a, b, strlen(b));
 625     }
 626   else
 627     {
 628       size_t i;
 629       const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
 630       const char *b = cstr;
 631       for (i = 0; i < strlen (b); i++)
 632         {
 633           if (a[i] != (unsigned char) b[i])
 634             return 1;
 635         }
 636     }
 637   return 0;
 638 }
 639
 640 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
 641 void
 642 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
 643 {
 644   if (chr > 0xFF && scm_i_is_narrow_string (str))
 645     SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
 646
 647   if (scm_i_is_narrow_string (str))
 648     {
 649       char *dst = scm_i_string_writable_chars (str);
 650       dst[p] = chr;
 651     }
 652   else
 653     {
 654       scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
 655       dst[p] = chr;
 656     }
 657 }
 658
 659 \f
 660 /* Symbols.
 661
 662    Basic symbol creation and accessing is done here, the rest is in
 663    symbols.[hc].  This has been done to keep stringbufs and the
 664    internals of strings and string-like objects confined to this file.
 665 */
 666
 667 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
 668
 669 SCM
 670 scm_i_make_symbol (SCM name, scm_t_bits flags,
 671                    unsigned long hash, SCM props)
 672 {
 673   SCM buf;
 674   size_t start = STRING_START (name);
 675   size_t length = STRING_LENGTH (name);
 676
 677   if (IS_SH_STRING (name))
 678     {
 679       name = SH_STRING_STRING (name);
 680       start += STRING_START (name);
 681     }
 682   buf = SYMBOL_STRINGBUF (name);
 683
 684   if (start == 0 && length == STRINGBUF_LENGTH (buf))
 685     {
 686       /* reuse buf. */
 687       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 688       SET_STRINGBUF_SHARED (buf);
 689       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 690     }
 691   else
 692     {
 693       /* make new buf. */
 694       if (scm_i_is_narrow_string (name))
 695         {
 696           SCM new_buf = make_stringbuf (length);
 697           memcpy (STRINGBUF_CHARS (new_buf),
 698                   STRINGBUF_CHARS (buf) + start, length);
 699           buf = new_buf;
 700         }
 701       else
 702         {
 703           SCM new_buf = make_wide_stringbuf (length);
 704           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 705                    (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
 706                    length);
 707           buf = new_buf;
 708         }
 709     }
 710   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 711                           (scm_t_bits) hash, SCM_UNPACK (props));
 712 }
 713
 714 SCM
 715 scm_i_c_make_symbol (const char *name, size_t len,
 716                      scm_t_bits flags, unsigned long hash, SCM props)
 717 {
 718   SCM buf = make_stringbuf (len);
 719   memcpy (STRINGBUF_CHARS (buf), name, len);
 720
 721   return scm_immutable_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 722                                     (scm_t_bits) hash, SCM_UNPACK (props));
 723 }
 724
 725 /* Returns the number of characters in SYM.  This may be different
 726    from the memory size of SYM.  */
 727 size_t
 728 scm_i_symbol_length (SCM sym)
 729 {
 730   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 731 }
 732
 733 size_t
 734 scm_c_symbol_length (SCM sym)
 735 #define FUNC_NAME "scm_c_symbol_length"
 736 {
 737   SCM_VALIDATE_SYMBOL (1, sym);
 738
 739   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 740 }
 741 #undef FUNC_NAME
 742
 743 /* True if the name of SYM is stored as a Latin-1 encoded string.
 744    False if it is stored as a 32-bit UCS-4-encoded string.  */
 745 int
 746 scm_i_is_narrow_symbol (SCM sym)
 747 {
 748   SCM buf;
 749
 750   buf = SYMBOL_STRINGBUF (sym);
 751   return !STRINGBUF_WIDE (buf);
 752 }
 753
 754 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
 755    contains the name of SYM.  */
 756 const char *
 757 scm_i_symbol_chars (SCM sym)
 758 {
 759   SCM buf;
 760
 761   buf = SYMBOL_STRINGBUF (sym);
 762   if (!STRINGBUF_WIDE (buf))
 763     return (const char *) STRINGBUF_CHARS (buf);
 764   else
 765     scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
 766                     scm_list_1 (sym));
 767 }
 768
 769 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
 770    symbol's name.  */
 771 const scm_t_wchar *
 772 scm_i_symbol_wide_chars (SCM sym)
 773 {
 774   SCM buf;
 775
 776   buf = SYMBOL_STRINGBUF (sym);
 777   if (STRINGBUF_WIDE (buf))
 778     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
 779   else
 780     scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
 781                     scm_list_1 (sym));
 782 }
 783
 784 SCM
 785 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
 786 {
 787   SCM buf = SYMBOL_STRINGBUF (sym);
 788   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 789   SET_STRINGBUF_SHARED (buf);
 790   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 791   return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
 792                           (scm_t_bits)start, (scm_t_bits) end - start);
 793 }
 794
 795 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint.  */
 796 scm_t_wchar
 797 scm_i_symbol_ref (SCM sym, size_t x)
 798 {
 799   if (scm_i_is_narrow_symbol (sym))
 800     return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
 801   else
 802     return scm_i_symbol_wide_chars (sym)[x];
 803 }
 804
 805 /* Debugging
 806  */
 807
 808 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
 809             "Returns an association list containing debugging information\n"
 810             "for @var{str}. The association list has the following entries."
 811             "@table @code\n"
 812             "@item string\n"
 813             "The string itself.\n"
 814             "@item start\n"
 815             "The start index of the string into its stringbuf\n"
 816             "@item length\n"
 817             "The length of the string\n"
 818             "@item shared\n"
 819             "If this string is a substring, it returns its parent string.\n"
 820             "Otherwise, it returns @code{#f}\n"
 821             "@item read-only\n"
 822             "@code{#t} if the string is read-only\n"
 823             "@item stringbuf-chars\n"
 824             "A new string containing this string's stringbuf's characters\n"
 825             "@item stringbuf-length\n"
 826             "The number of characters in this stringbuf\n"
 827             "@item stringbuf-shared\n"
 828             "@code{#t} if this stringbuf is shared\n"
 829             "@item stringbuf-wide\n"
 830             "@code{#t} if this stringbuf's characters are stored in a\n"
 831             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
 832             "buffer\n"
 833             "@end table")
 834 #define FUNC_NAME s_scm_sys_string_dump
 835 {
 836   SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
 837   SCM buf;
 838   SCM_VALIDATE_STRING (1, str);
 839
 840   /* String info */
 841   e1 = scm_cons (scm_from_locale_symbol ("string"),
 842                  str);
 843   e2 = scm_cons (scm_from_locale_symbol ("start"),
 844                  scm_from_size_t (STRING_START (str)));
 845   e3 = scm_cons (scm_from_locale_symbol ("length"),
 846                  scm_from_size_t (STRING_LENGTH (str)));
 847
 848   if (IS_SH_STRING (str))
 849     {
 850       e4 = scm_cons (scm_from_locale_symbol ("shared"),
 851                      SH_STRING_STRING (str));
 852       buf = STRING_STRINGBUF (SH_STRING_STRING (str));
 853     }
 854   else
 855     {
 856       e4 = scm_cons (scm_from_locale_symbol ("shared"),
 857                      SCM_BOOL_F);
 858       buf = STRING_STRINGBUF (str);
 859     }
 860
 861   if (IS_RO_STRING (str))
 862     e5 = scm_cons (scm_from_locale_symbol ("read-only"),
 863                    SCM_BOOL_T);
 864   else
 865     e5 = scm_cons (scm_from_locale_symbol ("read-only"),
 866                    SCM_BOOL_F);
 867
 868   /* Stringbuf info */
 869   if (!STRINGBUF_WIDE (buf))
 870     {
 871       size_t len = STRINGBUF_LENGTH (buf);
 872       char *cbuf;
 873       SCM sbc = scm_i_make_string (len, &cbuf);
 874       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
 875       e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
 876                      sbc);
 877     }
 878   else
 879     {
 880       size_t len = STRINGBUF_LENGTH (buf);
 881       scm_t_wchar *cbuf;
 882       SCM sbc = scm_i_make_wide_string (len, &cbuf);
 883       u32_cpy ((scm_t_uint32 *) cbuf,
 884                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
 885       e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
 886                      sbc);
 887     }
 888   e7 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
 889                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
 890   if (STRINGBUF_SHARED (buf))
 891     e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
 892                    SCM_BOOL_T);
 893   else
 894     e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
 895                    SCM_BOOL_F);
 896   if (STRINGBUF_WIDE (buf))
 897     e9 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
 898                    SCM_BOOL_T);
 899   else
 900     e9 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
 901                    SCM_BOOL_F);
 902
 903   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
 904 }
 905 #undef FUNC_NAME
 906
 907 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
 908             "Returns an association list containing debugging information\n"
 909             "for @var{sym}. The association list has the following entries."
 910             "@table @code\n"
 911             "@item symbol\n"
 912             "The symbol itself\n"
 913             "@item hash\n"
 914             "Its hash value\n"
 915             "@item interned\n"
 916             "@code{#t} if it is an interned symbol\n"
 917             "@item stringbuf-chars\n"
 918             "A new string containing this symbols's stringbuf's characters\n"
 919             "@item stringbuf-length\n"
 920             "The number of characters in this stringbuf\n"
 921             "@item stringbuf-shared\n"
 922             "@code{#t} if this stringbuf is shared\n"
 923             "@item stringbuf-wide\n"
 924             "@code{#t} if this stringbuf's characters are stored in a\n"
 925             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
 926             "buffer\n"
 927             "@end table")
 928 #define FUNC_NAME s_scm_sys_symbol_dump
 929 {
 930   SCM e1, e2, e3, e4, e5, e6, e7;
 931   SCM buf;
 932   SCM_VALIDATE_SYMBOL (1, sym);
 933   e1 = scm_cons (scm_from_locale_symbol ("symbol"),
 934                  sym);
 935   e2 = scm_cons (scm_from_locale_symbol ("hash"),
 936                  scm_from_ulong (scm_i_symbol_hash (sym)));
 937   e3 = scm_cons (scm_from_locale_symbol ("interned"),
 938                  scm_symbol_interned_p (sym));
 939   buf = SYMBOL_STRINGBUF (sym);
 940
 941   /* Stringbuf info */
 942   if (!STRINGBUF_WIDE (buf))
 943     {
 944       size_t len = STRINGBUF_LENGTH (buf);
 945       char *cbuf;
 946       SCM sbc = scm_i_make_string (len, &cbuf);
 947       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
 948       e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
 949                      sbc);
 950     }
 951   else
 952     {
 953       size_t len = STRINGBUF_LENGTH (buf);
 954       scm_t_wchar *cbuf;
 955       SCM sbc = scm_i_make_wide_string (len, &cbuf);
 956       u32_cpy ((scm_t_uint32 *) cbuf,
 957                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
 958       e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
 959                      sbc);
 960     }
 961   e5 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
 962                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
 963   if (STRINGBUF_SHARED (buf))
 964     e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
 965                    SCM_BOOL_T);
 966   else
 967     e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
 968                    SCM_BOOL_F);
 969   if (STRINGBUF_WIDE (buf))
 970     e7 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
 971                     SCM_BOOL_T);
 972   else
 973     e7 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
 974                     SCM_BOOL_F);
 975   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
 976
 977 }
 978 #undef FUNC_NAME
 979
 980 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 981
 982 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
 983 #define FUNC_NAME s_scm_sys_stringbuf_hist
 984 {
 985   int i;
 986   for (i = 0; i < 1000; i++)
 987     if (lenhist[i])
 988       fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
 989   fprintf (stderr, ">999: %u\n", lenhist[1000]);
 990   return SCM_UNSPECIFIED;
 991 }
 992 #undef FUNC_NAME
 993
 994 #endif
 995
 996 \f
 997
 998 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
 999             (SCM obj),
1000             "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1001 #define FUNC_NAME s_scm_string_p
1002 {
1003   return scm_from_bool (IS_STRING (obj));
1004 }
1005 #undef FUNC_NAME
1006
1007
1008 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1009
1010 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1011             (SCM chrs),
1012             "@deffnx {Scheme Procedure} list->string chrs\n"
1013             "Return a newly allocated string composed of the arguments,\n"
1014             "@var{chrs}.")
1015 #define FUNC_NAME s_scm_string
1016 {
1017   SCM result = SCM_BOOL_F;
1018   SCM rest;
1019   size_t len;
1020   size_t p = 0;
1021   long i;
1022   int wide = 0;
1023
1024   /* Verify that this is a list of chars.  */
1025   i = scm_ilength (chrs);
1026   SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1027
1028   len = (size_t) i;
1029   rest = chrs;
1030
1031   while (len > 0 && scm_is_pair (rest))
1032     {
1033       SCM elt = SCM_CAR (rest);
1034       SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1035       if (SCM_CHAR (elt) > 0xFF)
1036         wide = 1;
1037       rest = SCM_CDR (rest);
1038       len--;
1039       scm_remember_upto_here_1 (elt);
1040     }
1041
1042   /* Construct a string containing this list of chars.  */
1043   len = (size_t) i;
1044   rest = chrs;
1045
1046   if (wide == 0)
1047     {
1048       char *buf;
1049
1050       result = scm_i_make_string (len, NULL);
1051       result = scm_i_string_start_writing (result);
1052       buf = scm_i_string_writable_chars (result);
1053       while (len > 0 && scm_is_pair (rest))
1054         {
1055           SCM elt = SCM_CAR (rest);
1056           buf[p] = (unsigned char) SCM_CHAR (elt);
1057           p++;
1058           rest = SCM_CDR (rest);
1059           len--;
1060           scm_remember_upto_here_1 (elt);
1061         }
1062     }
1063   else
1064     {
1065       scm_t_wchar *buf;
1066
1067       result = scm_i_make_wide_string (len, NULL);
1068       result = scm_i_string_start_writing (result);
1069       buf = scm_i_string_writable_wide_chars (result);
1070       while (len > 0 && scm_is_pair (rest))
1071         {
1072           SCM elt = SCM_CAR (rest);
1073           buf[p] = SCM_CHAR (elt);
1074           p++;
1075           rest = SCM_CDR (rest);
1076           len--;
1077           scm_remember_upto_here_1 (elt);
1078         }
1079     }
1080   scm_i_string_stop_writing ();
1081
1082   if (len > 0)
1083     scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1084   if (!scm_is_null (rest))
1085     scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1086
1087   return result;
1088 }
1089 #undef FUNC_NAME
1090
1091 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1092             (SCM k, SCM chr),
1093             "Return a newly allocated string of\n"
1094             "length @var{k}.  If @var{chr} is given, then all elements of\n"
1095             "the string are initialized to @var{chr}, otherwise the contents\n"
1096             "of the @var{string} are unspecified.")
1097 #define FUNC_NAME s_scm_make_string
1098 {
1099   return scm_c_make_string (scm_to_size_t (k), chr);
1100 }
1101 #undef FUNC_NAME
1102
1103 SCM
1104 scm_c_make_string (size_t len, SCM chr)
1105 #define FUNC_NAME NULL
1106 {
1107   size_t p;
1108   SCM res = scm_i_make_string (len, NULL);
1109
1110   if (!SCM_UNBNDP (chr))
1111     {
1112       SCM_VALIDATE_CHAR (0, chr);
1113       res = scm_i_string_start_writing (res);
1114       for (p = 0; p < len; p++)
1115         scm_i_string_set_x (res, p, SCM_CHAR (chr));
1116       scm_i_string_stop_writing ();
1117     }
1118
1119   return res;
1120 }
1121 #undef FUNC_NAME
1122
1123 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1124             (SCM string),
1125             "Return the number of characters in @var{string}.")
1126 #define FUNC_NAME s_scm_string_length
1127 {
1128   SCM_VALIDATE_STRING (1, string);
1129   return scm_from_size_t (STRING_LENGTH (string));
1130 }
1131 #undef FUNC_NAME
1132
1133 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1134             (SCM string),
1135             "Return the bytes used to represent a character in @var{string}."
1136             "This will return 1 or 4.")
1137 #define FUNC_NAME s_scm_string_bytes_per_char
1138 {
1139   SCM_VALIDATE_STRING (1, string);
1140   if (!scm_i_is_narrow_string (string))
1141     return scm_from_int (4);
1142
1143   return scm_from_int (1);
1144 }
1145 #undef FUNC_NAME
1146
1147 size_t
1148 scm_c_string_length (SCM string)
1149 {
1150   if (!IS_STRING (string))
1151     scm_wrong_type_arg_msg (NULL, 0, string, "string");
1152   return STRING_LENGTH (string);
1153 }
1154
1155 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1156             (SCM str, SCM k),
1157             "Return character @var{k} of @var{str} using zero-origin\n"
1158             "indexing. @var{k} must be a valid index of @var{str}.")
1159 #define FUNC_NAME s_scm_string_ref
1160 {
1161   size_t len;
1162   unsigned long idx;
1163
1164   SCM_VALIDATE_STRING (1, str);
1165
1166   len = scm_i_string_length (str);
1167   if (SCM_LIKELY (len > 0))
1168     idx = scm_to_unsigned_integer (k, 0, len - 1);
1169   else
1170     scm_out_of_range (NULL, k);
1171
1172   if (scm_i_is_narrow_string (str))
1173     return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1174   else
1175     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1176 }
1177 #undef FUNC_NAME
1178
1179 SCM
1180 scm_c_string_ref (SCM str, size_t p)
1181 {
1182   if (p >= scm_i_string_length (str))
1183     scm_out_of_range (NULL, scm_from_size_t (p));
1184   if (scm_i_is_narrow_string (str))
1185     return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1186   else
1187     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1188
1189 }
1190
1191 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1192             (SCM str, SCM k, SCM chr),
1193             "Store @var{chr} in element @var{k} of @var{str} and return\n"
1194             "an unspecified value. @var{k} must be a valid index of\n"
1195             "@var{str}.")
1196 #define FUNC_NAME s_scm_string_set_x
1197 {
1198   size_t len;
1199   unsigned long idx;
1200
1201   SCM_VALIDATE_STRING (1, str);
1202
1203   len = scm_i_string_length (str);
1204   if (SCM_LIKELY (len > 0))
1205     idx = scm_to_unsigned_integer (k, 0, len - 1);
1206   else
1207     scm_out_of_range (NULL, k);
1208
1209   SCM_VALIDATE_CHAR (3, chr);
1210   str = scm_i_string_start_writing (str);
1211   scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1212   scm_i_string_stop_writing ();
1213
1214   return SCM_UNSPECIFIED;
1215 }
1216 #undef FUNC_NAME
1217
1218 void
1219 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1220 {
1221   if (p >= scm_i_string_length (str))
1222     scm_out_of_range (NULL, scm_from_size_t (p));
1223   str = scm_i_string_start_writing (str);
1224   scm_i_string_set_x (str, p, SCM_CHAR (chr));
1225   scm_i_string_stop_writing ();
1226 }
1227
1228 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1229             (SCM str, SCM start, SCM end),
1230             "Return a newly allocated string formed from the characters\n"
1231             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1232             "ending with index @var{end} (exclusive).\n"
1233             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1234             "exact integers satisfying:\n\n"
1235             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1236 #define FUNC_NAME s_scm_substring
1237 {
1238   size_t len, from, to;
1239
1240   SCM_VALIDATE_STRING (1, str);
1241   len = scm_i_string_length (str);
1242   from = scm_to_unsigned_integer (start, 0, len);
1243   if (SCM_UNBNDP (end))
1244     to = len;
1245   else
1246     to = scm_to_unsigned_integer (end, from, len);
1247   return scm_i_substring (str, from, to);
1248 }
1249 #undef FUNC_NAME
1250
1251 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1252             (SCM str, SCM start, SCM end),
1253             "Return a newly allocated string formed from the characters\n"
1254             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1255             "ending with index @var{end} (exclusive).\n"
1256             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1257             "exact integers satisfying:\n"
1258             "\n"
1259             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1260             "\n"
1261             "The returned string is read-only.\n")
1262 #define FUNC_NAME s_scm_substring_read_only
1263 {
1264   size_t len, from, to;
1265
1266   SCM_VALIDATE_STRING (1, str);
1267   len = scm_i_string_length (str);
1268   from = scm_to_unsigned_integer (start, 0, len);
1269   if (SCM_UNBNDP (end))
1270     to = len;
1271   else
1272     to = scm_to_unsigned_integer (end, from, len);
1273   return scm_i_substring_read_only (str, from, to);
1274 }
1275 #undef FUNC_NAME
1276
1277 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1278             (SCM str, SCM start, SCM end),
1279             "Return a newly allocated string formed from the characters\n"
1280             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1281             "ending with index @var{end} (exclusive).\n"
1282             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1283             "exact integers satisfying:\n\n"
1284             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1285 #define FUNC_NAME s_scm_substring_copy
1286 {
1287   /* For the Scheme version, START is mandatory, but for the C
1288      version, it is optional.  See scm_string_copy in srfi-13.c for a
1289      rationale.
1290   */
1291
1292   size_t from, to;
1293
1294   SCM_VALIDATE_STRING (1, str);
1295   scm_i_get_substring_spec (scm_i_string_length (str),
1296                             start, &from, end, &to);
1297   return scm_i_substring_copy (str, from, to);
1298 }
1299 #undef FUNC_NAME
1300
1301 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1302             (SCM str, SCM start, SCM end),
1303             "Return string that indirectly refers to the characters\n"
1304             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1305             "ending with index @var{end} (exclusive).\n"
1306             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1307             "exact integers satisfying:\n\n"
1308             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1309 #define FUNC_NAME s_scm_substring_shared
1310 {
1311   size_t len, from, to;
1312
1313   SCM_VALIDATE_STRING (1, str);
1314   len = scm_i_string_length (str);
1315   from = scm_to_unsigned_integer (start, 0, len);
1316   if (SCM_UNBNDP (end))
1317     to = len;
1318   else
1319     to = scm_to_unsigned_integer (end, from, len);
1320   return scm_i_substring_shared (str, from, to);
1321 }
1322 #undef FUNC_NAME
1323
1324 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1325             (SCM args),
1326             "Return a newly allocated string whose characters form the\n"
1327             "concatenation of the given strings, @var{args}.")
1328 #define FUNC_NAME s_scm_string_append
1329 {
1330   SCM res;
1331   size_t len = 0;
1332   int wide = 0;
1333   SCM l, s;
1334   size_t i;
1335   union
1336   {
1337     char *narrow;
1338     scm_t_wchar *wide;
1339   } data;
1340
1341   SCM_VALIDATE_REST_ARGUMENT (args);
1342   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1343     {
1344       s = SCM_CAR (l);
1345       SCM_VALIDATE_STRING (SCM_ARGn, s);
1346       len += scm_i_string_length (s);
1347       if (!scm_i_is_narrow_string (s))
1348         wide = 1;
1349     }
1350   data.narrow = NULL;
1351   if (!wide)
1352     res = scm_i_make_string (len, &data.narrow);
1353   else
1354     res = scm_i_make_wide_string (len, &data.wide);
1355
1356   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1357     {
1358       size_t len;
1359       s = SCM_CAR (l);
1360       SCM_VALIDATE_STRING (SCM_ARGn, s);
1361       len = scm_i_string_length (s);
1362       if (!wide)
1363         {
1364           memcpy (data.narrow, scm_i_string_chars (s), len);
1365           data.narrow += len;
1366         }
1367       else
1368         {
1369           if (scm_i_is_narrow_string (s))
1370             {
1371               for (i = 0; i < scm_i_string_length (s); i++)
1372                 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
1373             }
1374           else
1375             u32_cpy ((scm_t_uint32 *) data.wide,
1376                      (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1377           data.wide += len;
1378         }
1379       scm_remember_upto_here_1 (s);
1380     }
1381   return res;
1382 }
1383 #undef FUNC_NAME
1384
1385 int
1386 scm_is_string (SCM obj)
1387 {
1388   return IS_STRING (obj);
1389 }
1390
1391 \f
1392 /* Conversion to/from other encodings.  */
1393
1394 SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1395 static void
1396 scm_encoding_error (const char *subr, int err, const char *message,
1397                     const char *from, const char *to, SCM string_or_bv)
1398 {
1399   /* Raise an exception that conveys all the information needed to debug the
1400      problem.  Only perform locale conversions that are safe; in particular,
1401      don't try to display STRING_OR_BV when it's a string since converting it to
1402      the output locale may fail.  */
1403   scm_throw (scm_encoding_error_key,
1404              scm_list_n (scm_from_locale_string (subr),
1405                          scm_from_locale_string (message),
1406                          scm_from_int (err),
1407                          scm_from_locale_string (from),
1408                          scm_from_locale_string (to),
1409                          string_or_bv,
1410                          SCM_UNDEFINED));
1411 }
1412
1413 SCM
1414 scm_from_stringn (const char *str, size_t len, const char *encoding,
1415                   scm_t_string_failed_conversion_handler handler)
1416 {
1417   size_t u32len, i;
1418   scm_t_wchar *u32;
1419   int wide = 0;
1420   SCM res;
1421
1422   if (len == 0)
1423     return scm_nullstr;
1424
1425   if (encoding == NULL)
1426     {
1427       /* If encoding is null, use Latin-1.  */
1428       char *buf;
1429       res = scm_i_make_string (len, &buf);
1430       memcpy (buf, str, len);
1431       return res;
1432     }
1433
1434   u32len = 0;
1435   u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1436                                                 (enum iconv_ilseq_handler)
1437                                                 handler,
1438                                                 str, len,
1439                                                 NULL,
1440                                                 NULL, &u32len);
1441
1442   if (SCM_UNLIKELY (u32 == NULL))
1443     {
1444       /* Raise an error and pass the raw C string as a bytevector to the `throw'
1445          handler.  */
1446       SCM bv;
1447       signed char *buf;
1448
1449       buf = scm_gc_malloc_pointerless (len, "bytevector");
1450       memcpy (buf, str, len);
1451       bv = scm_c_take_bytevector (buf, len);
1452
1453       scm_encoding_error (__func__, errno,
1454                           "input locale conversion error",
1455                           encoding, "UTF-32", bv);
1456     }
1457
1458   i = 0;
1459   while (i < u32len)
1460     if (u32[i++] > 0xFF)
1461       {
1462         wide = 1;
1463         break;
1464       }
1465
1466   if (!wide)
1467     {
1468       char *dst;
1469       res = scm_i_make_string (u32len, &dst);
1470       for (i = 0; i < u32len; i ++)
1471         dst[i] = (unsigned char) u32[i];
1472       dst[u32len] = '\0';
1473     }
1474   else
1475     {
1476       scm_t_wchar *wdst;
1477       res = scm_i_make_wide_string (u32len, &wdst);
1478       u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1479       wdst[u32len] = 0;
1480     }
1481
1482   free (u32);
1483   return res;
1484 }
1485
1486 SCM
1487 scm_from_locale_stringn (const char *str, size_t len)
1488 {
1489   const char *enc;
1490   scm_t_string_failed_conversion_handler hndl;
1491   SCM inport;
1492   scm_t_port *pt;
1493
1494   if (len == (size_t) -1)
1495     len = strlen (str);
1496   if (len == 0)
1497     return scm_nullstr;
1498
1499   inport = scm_current_input_port ();
1500   if (!SCM_UNBNDP (inport) && SCM_OPINPORTP (inport))
1501     {
1502       pt = SCM_PTAB_ENTRY (inport);
1503       enc = pt->encoding;
1504       hndl = pt->ilseq_handler;
1505     }
1506   else
1507     {
1508       enc = NULL;
1509       hndl = SCM_FAILED_CONVERSION_ERROR;
1510     }
1511
1512   return scm_from_stringn (str, len, enc, hndl);
1513 }
1514
1515 SCM
1516 scm_from_locale_string (const char *str)
1517 {
1518   if (str == NULL)
1519     return scm_nullstr;
1520
1521   return scm_from_locale_stringn (str, -1);
1522 }
1523
1524 SCM
1525 scm_i_from_utf8_string (const scm_t_uint8 *str)
1526 {
1527   return scm_from_stringn ((const char *) str,
1528                            strlen ((char *) str), "UTF-8",
1529                            SCM_FAILED_CONVERSION_ERROR);
1530 }
1531
1532 /* Create a new scheme string from the C string STR.  The memory of
1533    STR may be used directly as storage for the new string.  */
1534 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1535    would be to register a finalizer to eventually free(3) STR, which isn't
1536    worth it.  Should we just deprecate the `scm_take_' functions?  */
1537 SCM
1538 scm_take_locale_stringn (char *str, size_t len)
1539 {
1540   SCM res;
1541
1542   res = scm_from_locale_stringn (str, len);
1543   free (str);
1544
1545   return res;
1546 }
1547
1548 SCM
1549 scm_take_locale_string (char *str)
1550 {
1551   return scm_take_locale_stringn (str, -1);
1552 }
1553
1554 /* Change libunistring escapes (\uXXXX and \UXXXXXXXX) to \xXX \uXXXX
1555    and \UXXXXXX.  */
1556 static void
1557 unistring_escapes_to_guile_escapes (char **bufp, size_t *lenp)
1558 {
1559   char *before, *after;
1560   size_t i, j;
1561
1562   before = *bufp;
1563   after = *bufp;
1564   i = 0;
1565   j = 0;
1566   while (i < *lenp)
1567     {
1568       if ((i <= *lenp - 6)
1569           && before[i] == '\\'
1570           && before[i + 1] == 'u'
1571           && before[i + 2] == '0' && before[i + 3] == '0')
1572         {
1573           /* Convert \u00NN to \xNN */
1574           after[j] = '\\';
1575           after[j + 1] = 'x';
1576           after[j + 2] = tolower ((int) before[i + 4]);
1577           after[j + 3] = tolower ((int) before[i + 5]);
1578           i += 6;
1579           j += 4;
1580         }
1581       else if ((i <= *lenp - 10)
1582                && before[i] == '\\'
1583                && before[i + 1] == 'U'
1584                && before[i + 2] == '0' && before[i + 3] == '0')
1585         {
1586           /* Convert \U00NNNNNN to \UNNNNNN */
1587           after[j] = '\\';
1588           after[j + 1] = 'U';
1589           after[j + 2] = tolower ((int) before[i + 4]);
1590           after[j + 3] = tolower ((int) before[i + 5]);
1591           after[j + 4] = tolower ((int) before[i + 6]);
1592           after[j + 5] = tolower ((int) before[i + 7]);
1593           after[j + 6] = tolower ((int) before[i + 8]);
1594           after[j + 7] = tolower ((int) before[i + 9]);
1595           i += 10;
1596           j += 8;
1597         }
1598       else
1599         {
1600           after[j] = before[i];
1601           i++;
1602           j++;
1603         }
1604     }
1605   *lenp = j;
1606   after = scm_realloc (after, j);
1607 }
1608
1609 /* Change libunistring escapes (\uXXXX and \UXXXXXXXX) to \xXXXX; */
1610 static void
1611 unistring_escapes_to_r6rs_escapes (char **bufp, size_t *lenp)
1612 {
1613   char *before, *after;
1614   size_t i, j;
1615   /* The worst case is if the input string contains all 4-digit hex escapes.
1616      "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1617   size_t max_out_len = (*lenp * 7) / 6 + 1;
1618   size_t nzeros, ndigits;
1619
1620   before = *bufp;
1621   after = alloca (max_out_len);
1622   i = 0;
1623   j = 0;
1624   while (i < *lenp)
1625     {
1626       if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1627           || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1628         {
1629           if (before[i + 1] == 'u')
1630             ndigits = 4;
1631           else if (before[i + 1] == 'U')
1632             ndigits = 8;
1633           else
1634             abort ();
1635
1636           /* Add the R6RS hex escape initial sequence.  */
1637           after[j] = '\\';
1638           after[j + 1] = 'x';
1639
1640           /* Move string positions to the start of the hex numbers.  */
1641           i += 2;
1642           j += 2;
1643
1644           /* Find the number of initial zeros in this hex number.  */
1645           nzeros = 0;
1646           while (before[i + nzeros] == '0' && nzeros < ndigits)
1647             nzeros++;
1648
1649           /* Copy the number, skipping initial zeros, and then move the string
1650              positions.  */
1651           if (nzeros == ndigits)
1652             {
1653               after[j] = '0';
1654               i += ndigits;
1655               j += 1;
1656             }
1657           else
1658             {
1659               int pos;
1660               for (pos = 0; pos < ndigits - nzeros; pos++)
1661                 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1662               i += ndigits;
1663               j += (ndigits - nzeros);
1664             }
1665
1666           /* Add terminating semicolon.  */
1667           after[j] = ';';
1668           j++;
1669         }
1670       else
1671         {
1672           after[j] = before[i];
1673           i++;
1674           j++;
1675         }
1676     }
1677   *lenp = j;
1678   before = scm_realloc (before, j);
1679   memcpy (before, after, j);
1680 }
1681
1682
1683 char *
1684 scm_to_locale_stringn (SCM str, size_t *lenp)
1685 {
1686   SCM outport;
1687   scm_t_port *pt;
1688   const char *enc;
1689
1690   outport = scm_current_output_port ();
1691   if (!SCM_UNBNDP (outport) && SCM_OPOUTPORTP (outport))
1692     {
1693       pt = SCM_PTAB_ENTRY (outport);
1694       enc = pt->encoding;
1695     }
1696   else
1697     enc = NULL;
1698
1699   return scm_to_stringn (str, lenp,
1700                          enc,
1701                          scm_i_get_conversion_strategy (SCM_BOOL_F));
1702 }
1703
1704 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
1705    according to ENCODING.  If LENP is non-NULL, set it to the size in bytes of
1706    the returned buffer.  If the conversion to ENCODING fails, apply the strategy
1707    defined by HANDLER.  */
1708 char *
1709 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
1710                 scm_t_string_failed_conversion_handler handler)
1711 {
1712   char *buf;
1713   size_t ilen, len, i;
1714   int ret;
1715   const char *enc;
1716
1717   if (!scm_is_string (str))
1718     scm_wrong_type_arg_msg (NULL, 0, str, "string");
1719   ilen = scm_i_string_length (str);
1720
1721   if (ilen == 0)
1722     {
1723       buf = scm_malloc (1);
1724       buf[0] = '\0';
1725       if (lenp)
1726         *lenp = 0;
1727       return buf;
1728     }
1729
1730   if (lenp == NULL)
1731     for (i = 0; i < ilen; i++)
1732       if (scm_i_string_ref (str, i) == '\0')
1733         scm_misc_error (NULL,
1734                         "string contains #\\nul character: ~S",
1735                         scm_list_1 (str));
1736
1737   if (scm_i_is_narrow_string (str) && (encoding == NULL))
1738     {
1739       /* If using native Latin-1 encoding, just copy the string
1740          contents.  */
1741       if (lenp)
1742         {
1743           buf = scm_malloc (ilen);
1744           memcpy (buf, scm_i_string_chars (str), ilen);
1745           *lenp = ilen;
1746           return buf;
1747         }
1748       else
1749         {
1750           buf = scm_malloc (ilen + 1);
1751           memcpy (buf, scm_i_string_chars (str), ilen);
1752           buf[ilen] = '\0';
1753           return buf;
1754         }
1755     }
1756
1757
1758   buf = NULL;
1759   len = 0;
1760   enc = encoding;
1761   if (enc == NULL)
1762     enc = "ISO-8859-1";
1763   if (scm_i_is_narrow_string (str))
1764     {
1765       ret = mem_iconveh (scm_i_string_chars (str), ilen,
1766                          "ISO-8859-1", enc,
1767                          (enum iconv_ilseq_handler) handler, NULL,
1768                          &buf, &len);
1769
1770       if (ret != 0)
1771         scm_encoding_error (__func__, errno,
1772                             "cannot convert to output locale",
1773                             "ISO-8859-1", enc, str);
1774     }
1775   else
1776     {
1777       buf = u32_conv_to_encoding (enc,
1778                                   (enum iconv_ilseq_handler) handler,
1779                                   (scm_t_uint32 *) scm_i_string_wide_chars (str),
1780                                   ilen,
1781                                   NULL,
1782                                   NULL, &len);
1783       if (buf == NULL)
1784         scm_encoding_error (__func__, errno,
1785                             "cannot convert to output locale",
1786                             "UTF-32", enc, str);
1787     }
1788   if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
1789     {
1790       if (SCM_R6RS_ESCAPES_P)
1791         unistring_escapes_to_r6rs_escapes (&buf, &len);
1792       else
1793         unistring_escapes_to_guile_escapes (&buf, &len);
1794     }
1795   if (lenp)
1796     *lenp = len;
1797   else
1798     {
1799       buf = scm_realloc (buf, len + 1);
1800       buf[len] = '\0';
1801     }
1802
1803   scm_remember_upto_here_1 (str);
1804   return buf;
1805 }
1806
1807 char *
1808 scm_to_locale_string (SCM str)
1809 {
1810   return scm_to_locale_stringn (str, NULL);
1811 }
1812
1813 scm_t_uint8 *
1814 scm_i_to_utf8_string (SCM str)
1815 {
1816   char *u8str;
1817   u8str = scm_to_stringn (str, NULL, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1818   return (scm_t_uint8 *) u8str;
1819 }
1820
1821 size_t
1822 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
1823 {
1824   size_t len;
1825   char *result = NULL;
1826   if (!scm_is_string (str))
1827     scm_wrong_type_arg_msg (NULL, 0, str, "string");
1828   result = scm_to_locale_stringn (str, &len);
1829
1830   memcpy (buf, result, (len > max_len) ? max_len : len);
1831   free (result);
1832
1833   scm_remember_upto_here_1 (str);
1834   return len;
1835 }
1836
1837 \f
1838 /* Unicode string normalization.  */
1839
1840 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
1841    libguile/i18n.c.  It would be useful to have this factored out into a more
1842    convenient location, but its use of alloca makes that tricky to do. */
1843
1844 static SCM
1845 normalize_str (SCM string, uninorm_t form)
1846 {
1847   SCM ret;
1848   scm_t_uint32 *w_str;
1849   scm_t_wchar *cbuf;
1850   size_t rlen, len = scm_i_string_length (string);
1851
1852   if (scm_i_is_narrow_string (string))
1853     {
1854       size_t i;
1855       const char *buf = scm_i_string_chars (string);
1856
1857       w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
1858
1859       for (i = 0; i < len; i ++)
1860         w_str[i] = (unsigned char) buf[i];
1861       w_str[len] = 0;
1862     }
1863   else
1864     w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
1865
1866   w_str = u32_normalize (form, w_str, len, NULL, &rlen);
1867
1868   ret = scm_i_make_wide_string (rlen, &cbuf);
1869   u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
1870   free (w_str);
1871
1872   scm_i_try_narrow_string (ret);
1873
1874   return ret;
1875 }
1876
1877 SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
1878             (SCM string),
1879             "Returns the NFC normalized form of @var{string}.")
1880 #define FUNC_NAME s_scm_string_normalize_nfc
1881 {
1882   SCM_VALIDATE_STRING (1, string);
1883   return normalize_str (string, UNINORM_NFC);
1884 }
1885 #undef FUNC_NAME
1886
1887 SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
1888             (SCM string),
1889             "Returns the NFD normalized form of @var{string}.")
1890 #define FUNC_NAME s_scm_string_normalize_nfd
1891 {
1892   SCM_VALIDATE_STRING (1, string);
1893   return normalize_str (string, UNINORM_NFD);
1894 }
1895 #undef FUNC_NAME
1896
1897 SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
1898             (SCM string),
1899             "Returns the NFKC normalized form of @var{string}.")
1900 #define FUNC_NAME s_scm_string_normalize_nfkc
1901 {
1902   SCM_VALIDATE_STRING (1, string);
1903   return normalize_str (string, UNINORM_NFKC);
1904 }
1905 #undef FUNC_NAME
1906
1907 SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
1908             (SCM string),
1909             "Returns the NFKD normalized form of @var{string}.")
1910 #define FUNC_NAME s_scm_string_normalize_nfkd
1911 {
1912   SCM_VALIDATE_STRING (1, string);
1913   return normalize_str (string, UNINORM_NFKD);
1914 }
1915 #undef FUNC_NAME
1916
1917 /* converts C scm_array of strings to SCM scm_list of strings. */
1918 /* If argc < 0, a null terminated scm_array is assumed. */
1919 SCM
1920 scm_makfromstrs (int argc, char **argv)
1921 {
1922   int i = argc;
1923   SCM lst = SCM_EOL;
1924   if (0 > i)
1925     for (i = 0; argv[i]; i++);
1926   while (i--)
1927     lst = scm_cons (scm_from_locale_string (argv[i]), lst);
1928   return lst;
1929 }
1930
1931 /* Return a newly allocated array of char pointers to each of the strings
1932    in args, with a terminating NULL pointer.  */
1933
1934 char **
1935 scm_i_allocate_string_pointers (SCM list)
1936 #define FUNC_NAME "scm_i_allocate_string_pointers"
1937 {
1938   char **result;
1939   int len = scm_ilength (list);
1940   int i;
1941
1942   if (len < 0)
1943     scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
1944
1945   result = scm_gc_malloc ((len + 1) * sizeof (char *),
1946                           "string pointers");
1947   result[len] = NULL;
1948
1949   /* The list might be have been modified in another thread, so
1950      we check LIST before each access.
1951    */
1952   for (i = 0; i < len && scm_is_pair (list); i++)
1953     {
1954       SCM str;
1955       size_t len;
1956
1957       str = SCM_CAR (list);
1958       len = scm_c_string_length (str);
1959
1960       result[i] = scm_gc_malloc_pointerless (len + 1, "string pointers");
1961       memcpy (result[i], scm_i_string_chars (str), len);
1962       result[i][len] = '\0';
1963
1964       list = SCM_CDR (list);
1965     }
1966
1967   return result;
1968 }
1969 #undef FUNC_NAME
1970
1971 void
1972 scm_i_get_substring_spec (size_t len,
1973                           SCM start, size_t *cstart,
1974                           SCM end, size_t *cend)
1975 {
1976   if (SCM_UNBNDP (start))
1977     *cstart = 0;
1978   else
1979     *cstart = scm_to_unsigned_integer (start, 0, len);
1980
1981   if (SCM_UNBNDP (end))
1982     *cend = len;
1983   else
1984     *cend = scm_to_unsigned_integer (end, *cstart, len);
1985 }
1986
1987 #if SCM_ENABLE_DEPRECATED
1988
1989 /* When these definitions are removed, it becomes reasonable to use
1990    read-only strings for string literals.  For that, change the reader
1991    to create string literals with scm_c_substring_read_only instead of
1992    with scm_c_substring_copy.
1993 */
1994
1995 int
1996 scm_i_deprecated_stringp (SCM str)
1997 {
1998   scm_c_issue_deprecation_warning
1999     ("SCM_STRINGP is deprecated.  Use scm_is_string instead.");
2000
2001   return scm_is_string (str);
2002 }
2003
2004 char *
2005 scm_i_deprecated_string_chars (SCM str)
2006 {
2007   char *chars;
2008
2009   scm_c_issue_deprecation_warning
2010     ("SCM_STRING_CHARS is deprecated.  See the manual for alternatives.");
2011
2012   /* We don't accept shared substrings here since they are not
2013      null-terminated.
2014   */
2015   if (IS_SH_STRING (str))
2016     scm_misc_error (NULL,
2017                     "SCM_STRING_CHARS does not work with shared substrings",
2018                     SCM_EOL);
2019
2020   /* We explicitly test for read-only strings to produce a better
2021      error message.
2022   */
2023
2024   if (IS_RO_STRING (str))
2025     scm_misc_error (NULL,
2026                     "SCM_STRING_CHARS does not work with read-only strings",
2027                     SCM_EOL);
2028
2029   /* The following is still wrong, of course...
2030    */
2031   str = scm_i_string_start_writing (str);
2032   chars = scm_i_string_writable_chars (str);
2033   scm_i_string_stop_writing ();
2034   return chars;
2035 }
2036
2037 size_t
2038 scm_i_deprecated_string_length (SCM str)
2039 {
2040   scm_c_issue_deprecation_warning
2041     ("SCM_STRING_LENGTH is deprecated.  Use scm_c_string_length instead.");
2042   return scm_c_string_length (str);
2043 }
2044
2045 #endif
2046
2047 static SCM
2048 string_handle_ref (scm_t_array_handle *h, size_t index)
2049 {
2050   return scm_c_string_ref (h->array, index);
2051 }
2052
2053 static void
2054 string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2055 {
2056   scm_c_string_set_x (h->array, index, val);
2057 }
2058
2059 static void
2060 string_get_handle (SCM v, scm_t_array_handle *h)
2061 {
2062   h->array = v;
2063   h->ndims = 1;
2064   h->dims = &h->dim0;
2065   h->dim0.lbnd = 0;
2066   h->dim0.ubnd = scm_c_string_length (v) - 1;
2067   h->dim0.inc = 1;
2068   h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2069   h->elements = h->writable_elements = NULL;
2070 }
2071
2072 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2073                           string_handle_ref, string_handle_set,
2074                           string_get_handle)
2075 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2076
2077 void
2078 scm_init_strings ()
2079 {
2080   scm_nullstr = scm_i_make_string (0, NULL);
2081
2082 #include "libguile/strings.x"
2083 }
2084
2085
2086 /*
2087   Local Variables:
2088   c-file-style: "gnu"
2089   End:
2090 */