libguile/strings.c

   1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
   2  *
   3  * This library is free software; you can redistribute it and/or
   4  * modify it under the terms of the GNU Lesser General Public License
   5  * as published by the Free Software Foundation; either version 3 of
   6  * the License, or (at your option) any later version.
   7  *
   8  * This library is distributed in the hope that it will be useful, but
   9  * WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * Lesser General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU Lesser General Public
  14  * License along with this library; if not, write to the Free Software
  15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  16  * 02110-1301 USA
  17  */
  18
  19
  20 \f
  21 #ifdef HAVE_CONFIG_H
  22 # include <config.h>
  23 #endif
  24
  25 #include <alloca.h>
  26 #include <string.h>
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <uninorm.h>
  30 #include <unistr.h>
  31 #include <uniconv.h>
  32 #include <c-strcase.h>
  33
  34 #include "striconveh.h"
  35
  36 #include "libguile/_scm.h"
  37 #include "libguile/chars.h"
  38 #include "libguile/root.h"
  39 #include "libguile/strings.h"
  40 #include "libguile/ports.h"
  41 #include "libguile/ports-internal.h"
  42 #include "libguile/error.h"
  43 #include "libguile/generalized-vectors.h"
  44 #include "libguile/deprecation.h"
  45 #include "libguile/validate.h"
  46 #include "libguile/private-options.h"
  47
  48 \f
  49
  50 /* {Strings}
  51  */
  52
  53
  54 /* Stringbufs
  55  *
  56  * XXX - keeping an accurate refcount during GC seems to be quite
  57  * tricky, so we just keep score of whether a stringbuf might be
  58  * shared, not whether it definitely is.
  59  *
  60  * The scheme I (mvo) tried to keep an accurate reference count would
  61  * recount all strings that point to a stringbuf during the mark-phase
  62  * of the GC.  This was done since one cannot access the stringbuf of
  63  * a string when that string is freed (in order to decrease the
  64  * reference count).  The memory of the stringbuf might have been
  65  * reused already for something completely different.
  66  *
  67  * This recounted worked for a small number of threads beating on
  68  * cow-strings, but it failed randomly with more than 10 threads, say.
  69  * I couldn't figure out what went wrong, so I used the conservative
  70  * approach implemented below.
  71  *
  72  * There are 2 storage strategies for stringbufs: 8-bit and wide.  8-bit
  73  * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
  74  * strings.
  75  */
  76
  77 /* The size in words of the stringbuf header (type tag + size).  */
  78 #define STRINGBUF_HEADER_SIZE   2U
  79
  80 #define STRINGBUF_HEADER_BYTES  (STRINGBUF_HEADER_SIZE * sizeof (SCM))
  81
  82 #define STRINGBUF_F_SHARED      SCM_I_STRINGBUF_F_SHARED
  83 #define STRINGBUF_F_WIDE        SCM_I_STRINGBUF_F_WIDE
  84
  85 #define STRINGBUF_TAG           scm_tc7_stringbuf
  86 #define STRINGBUF_SHARED(buf)   (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
  87 #define STRINGBUF_WIDE(buf)     (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
  88
  89 #define STRINGBUF_CONTENTS(buf) ((void *)                               \
  90                                  SCM_CELL_OBJECT_LOC (buf,              \
  91                                                       STRINGBUF_HEADER_SIZE))
  92 #define STRINGBUF_CHARS(buf)    ((unsigned char *) STRINGBUF_CONTENTS (buf))
  93 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
  94
  95 #define STRINGBUF_LENGTH(buf)   (SCM_CELL_WORD_1 (buf))
  96
  97 #define SET_STRINGBUF_SHARED(buf)                                       \
  98   do                                                                    \
  99     {                                                                   \
 100       /* Don't modify BUF if it's already marked as shared since it might be \
 101          a read-only, statically allocated stringbuf.  */               \
 102       if (SCM_LIKELY (!STRINGBUF_SHARED (buf)))                         \
 103         SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
 104     }                                                                   \
 105   while (0)
 106
 107 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 108 static size_t lenhist[1001];
 109 #endif
 110
 111 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
 112    characters. */
 113 static SCM
 114 make_stringbuf (size_t len)
 115 {
 116   /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
 117      scm_i_symbol_chars, all stringbufs are null-terminated.  Once
 118      SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
 119      has been changed for scm_i_symbol_chars, this null-termination
 120      can be dropped.
 121   */
 122
 123   SCM buf;
 124
 125 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 126   if (len < 1000)
 127     lenhist[len]++;
 128   else
 129     lenhist[1000]++;
 130 #endif
 131
 132   buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
 133                                             "string"));
 134
 135   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
 136   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 137
 138   STRINGBUF_CHARS (buf)[len] = 0;
 139
 140   return buf;
 141 }
 142
 143 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
 144    characters.  */
 145 static SCM
 146 make_wide_stringbuf (size_t len)
 147 {
 148   SCM buf;
 149   size_t raw_len;
 150
 151 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 152   if (len < 1000)
 153     lenhist[len]++;
 154   else
 155     lenhist[1000]++;
 156 #endif
 157
 158   raw_len = (len + 1) * sizeof (scm_t_wchar);
 159   buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
 160                                             "string"));
 161
 162   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
 163   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 164
 165   STRINGBUF_WIDE_CHARS (buf)[len] = 0;
 166
 167   return buf;
 168 }
 169
 170 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
 171    characters from BUF.  */
 172 static SCM
 173 wide_stringbuf (SCM buf)
 174 {
 175   SCM new_buf;
 176
 177   if (STRINGBUF_WIDE (buf))
 178     new_buf = buf;
 179   else
 180     {
 181       size_t i, len;
 182       scm_t_wchar *mem;
 183
 184       len = STRINGBUF_LENGTH (buf);
 185
 186       new_buf = make_wide_stringbuf (len);
 187
 188       mem = STRINGBUF_WIDE_CHARS (new_buf);
 189       for (i = 0; i < len; i++)
 190         mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
 191       mem[len] = 0;
 192     }
 193
 194   return new_buf;
 195 }
 196
 197 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
 198    characters from BUF, if possible.  */
 199 static SCM
 200 narrow_stringbuf (SCM buf)
 201 {
 202   SCM new_buf;
 203
 204   if (!STRINGBUF_WIDE (buf))
 205     new_buf = buf;
 206   else
 207     {
 208       size_t i, len;
 209       scm_t_wchar *wmem;
 210       unsigned char *mem;
 211
 212       len = STRINGBUF_LENGTH (buf);
 213       wmem = STRINGBUF_WIDE_CHARS (buf);
 214
 215       for (i = 0; i < len; i++)
 216         if (wmem[i] > 0xFF)
 217           /* BUF cannot be narrowed.  */
 218           return buf;
 219
 220       new_buf = make_stringbuf (len);
 221
 222       mem = STRINGBUF_CHARS (new_buf);
 223       for (i = 0; i < len; i++)
 224         mem[i] = (unsigned char) wmem[i];
 225       mem[len] = 0;
 226     }
 227
 228   return new_buf;
 229 }
 230
 231 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
 232
 233 \f
 234 /* Copy-on-write strings.
 235  */
 236
 237 #define STRING_TAG            scm_tc7_string
 238
 239 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
 240 #define STRING_START(str)     ((size_t)SCM_CELL_WORD_2(str))
 241 #define STRING_LENGTH(str)    ((size_t)SCM_CELL_WORD_3(str))
 242
 243 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
 244 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
 245
 246 #define IS_STRING(str)        (SCM_HAS_TYP7 (str, STRING_TAG))
 247
 248 /* Read-only strings.
 249  */
 250
 251 #define RO_STRING_TAG         scm_tc7_ro_string
 252 #define IS_RO_STRING(str)     (SCM_CELL_TYPE(str)==RO_STRING_TAG)
 253
 254 /* Mutation-sharing substrings
 255  */
 256
 257 #define SH_STRING_TAG       (scm_tc7_string + 0x100)
 258
 259 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
 260 /* START and LENGTH as for STRINGs. */
 261
 262 #define IS_SH_STRING(str)   (SCM_CELL_TYPE(str)==SH_STRING_TAG)
 263
 264 void
 265 scm_i_print_stringbuf (SCM exp, SCM port, scm_print_state *pstate)
 266 {
 267   SCM str;
 268
 269   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 270   SET_STRINGBUF_SHARED (exp);
 271   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 272
 273   str =  scm_double_cell (RO_STRING_TAG, SCM_UNPACK(exp),
 274                           0, STRINGBUF_LENGTH (exp));
 275
 276   scm_puts ("#<stringbuf ", port);
 277   scm_iprin1 (str, port, pstate);
 278   scm_puts (">", port);
 279 }
 280
 281 SCM scm_nullstr;
 282
 283 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
 284    characters.  CHARSP, if not NULL, will be set to location of the
 285    char array.  If READ_ONLY_P, the returned string is read-only;
 286    otherwise it is writable.  */
 287 SCM
 288 scm_i_make_string (size_t len, char **charsp, int read_only_p)
 289 {
 290   static SCM null_stringbuf = SCM_BOOL_F;
 291   SCM buf;
 292   SCM res;
 293
 294   if (len == 0)
 295     {
 296       if (SCM_UNLIKELY (scm_is_false (null_stringbuf)))
 297         {
 298           null_stringbuf = make_stringbuf (0);
 299           SET_STRINGBUF_SHARED (null_stringbuf);
 300         }
 301       buf = null_stringbuf;
 302     }
 303   else
 304     buf = make_stringbuf (len);
 305
 306   if (charsp)
 307     *charsp = (char *) STRINGBUF_CHARS (buf);
 308   res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
 309                          SCM_UNPACK (buf),
 310                          (scm_t_bits) 0, (scm_t_bits) len);
 311   return res;
 312 }
 313
 314 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
 315    characters.  CHARSP, if not NULL, will be set to location of the
 316    character array.  If READ_ONLY_P, the returned string is read-only;
 317    otherwise it is writable.  */
 318 SCM
 319 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
 320 {
 321   SCM buf = make_wide_stringbuf (len);
 322   SCM res;
 323   if (charsp)
 324     *charsp = STRINGBUF_WIDE_CHARS (buf);
 325   res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
 326                          SCM_UNPACK (buf),
 327                          (scm_t_bits) 0, (scm_t_bits) len);
 328   return res;
 329 }
 330
 331 static void
 332 validate_substring_args (SCM str, size_t start, size_t end)
 333 {
 334   if (!IS_STRING (str))
 335     scm_wrong_type_arg_msg (NULL, 0, str, "string");
 336   if (start > STRING_LENGTH (str))
 337     scm_out_of_range (NULL, scm_from_size_t (start));
 338   if (end > STRING_LENGTH (str) || end < start)
 339     scm_out_of_range (NULL, scm_from_size_t (end));
 340 }
 341
 342 static inline void
 343 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
 344 {
 345   *start = STRING_START (*str);
 346   if (IS_SH_STRING (*str))
 347     {
 348       *str = SH_STRING_STRING (*str);
 349       *start += STRING_START (*str);
 350     }
 351   *buf = STRING_STRINGBUF (*str);
 352 }
 353
 354 SCM
 355 scm_i_substring (SCM str, size_t start, size_t end)
 356 {
 357   if (start == end)
 358     return scm_i_make_string (0, NULL, 0);
 359   else
 360     {
 361       SCM buf;
 362       size_t str_start;
 363       get_str_buf_start (&str, &buf, &str_start);
 364       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 365       SET_STRINGBUF_SHARED (buf);
 366       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 367       return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
 368                               (scm_t_bits)str_start + start,
 369                               (scm_t_bits) end - start);
 370     }
 371 }
 372
 373 SCM
 374 scm_i_substring_read_only (SCM str, size_t start, size_t end)
 375 {
 376   if (start == end)
 377     return scm_i_make_string (0, NULL, 1);
 378   else
 379     {
 380       SCM buf;
 381       size_t str_start;
 382       get_str_buf_start (&str, &buf, &str_start);
 383       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 384       SET_STRINGBUF_SHARED (buf);
 385       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 386       return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
 387                               (scm_t_bits)str_start + start,
 388                               (scm_t_bits) end - start);
 389     }
 390 }
 391
 392 SCM
 393 scm_i_substring_copy (SCM str, size_t start, size_t end)
 394 {
 395   if (start == end)
 396     return scm_i_make_string (0, NULL, 0);
 397   else
 398     {
 399       size_t len = end - start;
 400       SCM buf, my_buf, substr;
 401       size_t str_start;
 402       int wide = 0;
 403       get_str_buf_start (&str, &buf, &str_start);
 404       if (scm_i_is_narrow_string (str))
 405         {
 406           my_buf = make_stringbuf (len);
 407           memcpy (STRINGBUF_CHARS (my_buf),
 408                   STRINGBUF_CHARS (buf) + str_start + start, len);
 409         }
 410       else
 411         {
 412           my_buf = make_wide_stringbuf (len);
 413           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
 414                    (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
 415                                      + start), len);
 416           wide = 1;
 417         }
 418       scm_remember_upto_here_1 (buf);
 419       substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
 420                                 (scm_t_bits) 0, (scm_t_bits) len);
 421       if (wide)
 422         scm_i_try_narrow_string (substr);
 423       return substr;
 424     }
 425 }
 426
 427 SCM
 428 scm_i_substring_shared (SCM str, size_t start, size_t end)
 429 {
 430   if (start == 0 && end == STRING_LENGTH (str))
 431     return str;
 432   else if (start == end)
 433     return scm_i_make_string (0, NULL, 0);
 434   else
 435     {
 436       size_t len = end - start;
 437       if (IS_SH_STRING (str))
 438         {
 439           start += STRING_START (str);
 440           str = SH_STRING_STRING (str);
 441         }
 442       return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
 443                               (scm_t_bits)start, (scm_t_bits) len);
 444     }
 445 }
 446
 447 SCM
 448 scm_c_substring (SCM str, size_t start, size_t end)
 449 {
 450   validate_substring_args (str, start, end);
 451   return scm_i_substring (str, start, end);
 452 }
 453
 454 SCM
 455 scm_c_substring_read_only (SCM str, size_t start, size_t end)
 456 {
 457   validate_substring_args (str, start, end);
 458   return scm_i_substring_read_only (str, start, end);
 459 }
 460
 461 SCM
 462 scm_c_substring_copy (SCM str, size_t start, size_t end)
 463 {
 464   validate_substring_args (str, start, end);
 465   return scm_i_substring_copy (str, start, end);
 466 }
 467
 468 SCM
 469 scm_c_substring_shared (SCM str, size_t start, size_t end)
 470 {
 471   validate_substring_args (str, start, end);
 472   return scm_i_substring_shared (str, start, end);
 473 }
 474
 475 \f
 476 /* Internal accessors
 477  */
 478
 479 /* Returns the number of characters in STR.  This may be different
 480    than the memory size of the string storage.  */
 481 size_t
 482 scm_i_string_length (SCM str)
 483 {
 484   return STRING_LENGTH (str);
 485 }
 486
 487 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
 488    encoding.  False if it is 'wide', having a 32-bit UCS-4
 489    encoding.  */
 490 int
 491 scm_i_is_narrow_string (SCM str)
 492 {
 493   if (IS_SH_STRING (str))
 494     str = SH_STRING_STRING (str);
 495
 496   return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
 497 }
 498
 499 /* Try to coerce a string to be narrow.  It if is narrow already, do
 500    nothing.  If it is wide, shrink it to narrow if none of its
 501    characters are above 0xFF.  Return true if the string is narrow or
 502    was made to be narrow.  */
 503 int
 504 scm_i_try_narrow_string (SCM str)
 505 {
 506   if (IS_SH_STRING (str))
 507     str = SH_STRING_STRING (str);
 508
 509   SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
 510
 511   return scm_i_is_narrow_string (str);
 512 }
 513
 514 /* Return a pointer to the raw data of the string, which can be either Latin-1
 515    or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'.  */
 516 const void *
 517 scm_i_string_data (SCM str)
 518 {
 519   SCM buf;
 520   size_t start;
 521   const char *data;
 522
 523   get_str_buf_start (&str, &buf, &start);
 524
 525   data = STRINGBUF_CONTENTS (buf);
 526   data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
 527
 528   return data;
 529 }
 530
 531 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
 532    STR.  */
 533 const char *
 534 scm_i_string_chars (SCM str)
 535 {
 536   SCM buf;
 537   size_t start;
 538   get_str_buf_start (&str, &buf, &start);
 539   if (scm_i_is_narrow_string (str))
 540     return (const char *) STRINGBUF_CHARS (buf) + start;
 541   else
 542     scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
 543                     scm_list_1 (str));
 544   return NULL;
 545 }
 546
 547 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
 548    STR.  */
 549 const scm_t_wchar *
 550 scm_i_string_wide_chars (SCM str)
 551 {
 552   SCM buf;
 553   size_t start;
 554
 555   get_str_buf_start (&str, &buf, &start);
 556   if (!scm_i_is_narrow_string (str))
 557     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
 558   else
 559     scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
 560                     scm_list_1 (str));
 561 }
 562
 563 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
 564    a new string buffer, so that it can be modified without modifying
 565    other strings.  Also, lock the string mutex.  Later, one must call
 566    scm_i_string_stop_writing to unlock the mutex.  */
 567 SCM
 568 scm_i_string_start_writing (SCM orig_str)
 569 {
 570   SCM buf, str = orig_str;
 571   size_t start;
 572
 573   get_str_buf_start (&str, &buf, &start);
 574   if (IS_RO_STRING (str))
 575     scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
 576
 577   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 578   if (STRINGBUF_SHARED (buf))
 579     {
 580       /* Clone the stringbuf.  */
 581       size_t len = STRING_LENGTH (str);
 582       SCM new_buf;
 583
 584       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 585
 586       if (scm_i_is_narrow_string (str))
 587         {
 588           new_buf = make_stringbuf (len);
 589           memcpy (STRINGBUF_CHARS (new_buf),
 590                   STRINGBUF_CHARS (buf) + STRING_START (str), len);
 591
 592         }
 593       else
 594         {
 595           new_buf = make_wide_stringbuf (len);
 596           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 597                    (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
 598                                      + STRING_START (str)), len);
 599         }
 600
 601       SET_STRING_STRINGBUF (str, new_buf);
 602       start -= STRING_START (str);
 603
 604       /* FIXME: The following operations are not atomic, so other threads
 605          looking at STR may see an inconsistent state.  Nevertheless it can't
 606          hurt much since (i) accessing STR while it is being mutated can't
 607          yield a crash, and (ii) concurrent accesses to STR should be
 608          protected by a mutex at the application level.  The latter may not
 609          apply when STR != ORIG_STR, though.  */
 610       SET_STRING_START (str, 0);
 611       SET_STRING_STRINGBUF (str, new_buf);
 612
 613       buf = new_buf;
 614
 615       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 616     }
 617   return orig_str;
 618 }
 619
 620 /* Return a pointer to the 8-bit Latin-1 chars of a string.  */
 621 char *
 622 scm_i_string_writable_chars (SCM str)
 623 {
 624   SCM buf;
 625   size_t start;
 626
 627   get_str_buf_start (&str, &buf, &start);
 628   if (scm_i_is_narrow_string (str))
 629     return (char *) STRINGBUF_CHARS (buf) + start;
 630   else
 631     scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
 632                     scm_list_1 (str));
 633   return NULL;
 634 }
 635
 636 /* Return a pointer to the UCS-4 codepoints of a string.  */
 637 static scm_t_wchar *
 638 scm_i_string_writable_wide_chars (SCM str)
 639 {
 640   SCM buf;
 641   size_t start;
 642
 643   get_str_buf_start (&str, &buf, &start);
 644   if (!scm_i_is_narrow_string (str))
 645     return STRINGBUF_WIDE_CHARS (buf) + start;
 646   else
 647     scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
 648                     scm_list_1 (str));
 649 }
 650
 651 /* Unlock the string mutex that was locked when
 652    scm_i_string_start_writing was called.  */
 653 void
 654 scm_i_string_stop_writing (void)
 655 {
 656   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 657 }
 658
 659 /* Return the Xth character of STR as a UCS-4 codepoint.  */
 660 scm_t_wchar
 661 scm_i_string_ref (SCM str, size_t x)
 662 {
 663   if (scm_i_is_narrow_string (str))
 664     return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
 665   else
 666     return scm_i_string_wide_chars (str)[x];
 667 }
 668
 669 /* Returns index+1 of the first char in STR that matches C, or
 670    0 if the char is not found.  */
 671 int
 672 scm_i_string_contains_char (SCM str, char ch)
 673 {
 674   size_t i;
 675   size_t len = scm_i_string_length (str);
 676
 677   i = 0;
 678   if (scm_i_is_narrow_string (str))
 679     {
 680       while (i < len)
 681         {
 682           if (scm_i_string_chars (str)[i] == ch)
 683             return i+1;
 684           i++;
 685         }
 686     }
 687   else
 688     {
 689       while (i < len)
 690         {
 691           if (scm_i_string_wide_chars (str)[i]
 692               == (unsigned char) ch)
 693             return i+1;
 694           i++;
 695         }
 696     }
 697   return 0;
 698 }
 699
 700 int
 701 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
 702 {
 703   if (scm_i_is_narrow_string (sstr))
 704     {
 705       const char *a = scm_i_string_chars (sstr) + start_x;
 706       const char *b = cstr;
 707       return strncmp (a, b, strlen(b));
 708     }
 709   else
 710     {
 711       size_t i;
 712       const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
 713       const char *b = cstr;
 714       for (i = 0; i < strlen (b); i++)
 715         {
 716           if (a[i] != (unsigned char) b[i])
 717             return 1;
 718         }
 719     }
 720   return 0;
 721 }
 722
 723 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
 724 void
 725 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
 726 {
 727   if (IS_SH_STRING (str))
 728     {
 729       p += STRING_START (str);
 730       str = SH_STRING_STRING (str);
 731     }
 732
 733   if (chr > 0xFF && scm_i_is_narrow_string (str))
 734     SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
 735
 736   if (scm_i_is_narrow_string (str))
 737     {
 738       char *dst = scm_i_string_writable_chars (str);
 739       dst[p] = chr;
 740     }
 741   else
 742     {
 743       scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
 744       dst[p] = chr;
 745     }
 746 }
 747
 748 \f
 749 /* Symbols.
 750
 751    Basic symbol creation and accessing is done here, the rest is in
 752    symbols.[hc].  This has been done to keep stringbufs and the
 753    internals of strings and string-like objects confined to this file.
 754 */
 755
 756 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
 757
 758 SCM
 759 scm_i_make_symbol (SCM name, scm_t_bits flags,
 760                    unsigned long hash, SCM props)
 761 {
 762   SCM buf;
 763   size_t start = STRING_START (name);
 764   size_t length = STRING_LENGTH (name);
 765
 766   if (IS_SH_STRING (name))
 767     {
 768       name = SH_STRING_STRING (name);
 769       start += STRING_START (name);
 770     }
 771   buf = STRING_STRINGBUF (name);
 772
 773   if (start == 0 && length == STRINGBUF_LENGTH (buf))
 774     {
 775       /* reuse buf. */
 776       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 777       SET_STRINGBUF_SHARED (buf);
 778       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 779     }
 780   else
 781     {
 782       /* make new buf. */
 783       if (scm_i_is_narrow_string (name))
 784         {
 785           SCM new_buf = make_stringbuf (length);
 786           memcpy (STRINGBUF_CHARS (new_buf),
 787                   STRINGBUF_CHARS (buf) + start, length);
 788           buf = new_buf;
 789         }
 790       else
 791         {
 792           SCM new_buf = make_wide_stringbuf (length);
 793           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 794                    (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
 795                    length);
 796           buf = new_buf;
 797         }
 798     }
 799   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 800                           (scm_t_bits) hash, SCM_UNPACK (props));
 801 }
 802
 803 SCM
 804 scm_i_c_make_symbol (const char *name, size_t len,
 805                      scm_t_bits flags, unsigned long hash, SCM props)
 806 {
 807   SCM buf = make_stringbuf (len);
 808   memcpy (STRINGBUF_CHARS (buf), name, len);
 809
 810   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 811                           (scm_t_bits) hash, SCM_UNPACK (props));
 812 }
 813
 814 /* Returns the number of characters in SYM.  This may be different
 815    from the memory size of SYM.  */
 816 size_t
 817 scm_i_symbol_length (SCM sym)
 818 {
 819   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 820 }
 821
 822 size_t
 823 scm_c_symbol_length (SCM sym)
 824 #define FUNC_NAME "scm_c_symbol_length"
 825 {
 826   SCM_VALIDATE_SYMBOL (1, sym);
 827
 828   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 829 }
 830 #undef FUNC_NAME
 831
 832 /* True if the name of SYM is stored as a Latin-1 encoded string.
 833    False if it is stored as a 32-bit UCS-4-encoded string.  */
 834 int
 835 scm_i_is_narrow_symbol (SCM sym)
 836 {
 837   SCM buf;
 838
 839   buf = SYMBOL_STRINGBUF (sym);
 840   return !STRINGBUF_WIDE (buf);
 841 }
 842
 843 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
 844    contains the name of SYM.  */
 845 const char *
 846 scm_i_symbol_chars (SCM sym)
 847 {
 848   SCM buf;
 849
 850   buf = SYMBOL_STRINGBUF (sym);
 851   if (!STRINGBUF_WIDE (buf))
 852     return (const char *) STRINGBUF_CHARS (buf);
 853   else
 854     scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
 855                     scm_list_1 (sym));
 856 }
 857
 858 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
 859    symbol's name.  */
 860 const scm_t_wchar *
 861 scm_i_symbol_wide_chars (SCM sym)
 862 {
 863   SCM buf;
 864
 865   buf = SYMBOL_STRINGBUF (sym);
 866   if (STRINGBUF_WIDE (buf))
 867     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
 868   else
 869     scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
 870                     scm_list_1 (sym));
 871 }
 872
 873 SCM
 874 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
 875 {
 876   SCM buf = SYMBOL_STRINGBUF (sym);
 877   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 878   SET_STRINGBUF_SHARED (buf);
 879   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 880   return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
 881                           (scm_t_bits)start, (scm_t_bits) end - start);
 882 }
 883
 884 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint.  */
 885 scm_t_wchar
 886 scm_i_symbol_ref (SCM sym, size_t x)
 887 {
 888   if (scm_i_is_narrow_symbol (sym))
 889     return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
 890   else
 891     return scm_i_symbol_wide_chars (sym)[x];
 892 }
 893
 894 /* Debugging
 895  */
 896
 897 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
 898             "Returns an association list containing debugging information\n"
 899             "for @var{str}. The association list has the following entries."
 900             "@table @code\n"
 901             "@item string\n"
 902             "The string itself.\n"
 903             "@item start\n"
 904             "The start index of the string into its stringbuf\n"
 905             "@item length\n"
 906             "The length of the string\n"
 907             "@item shared\n"
 908             "If this string is a substring, it returns its parent string.\n"
 909             "Otherwise, it returns @code{#f}\n"
 910             "@item read-only\n"
 911             "@code{#t} if the string is read-only\n"
 912             "@item stringbuf-chars\n"
 913             "A new string containing this string's stringbuf's characters\n"
 914             "@item stringbuf-length\n"
 915             "The number of characters in this stringbuf\n"
 916             "@item stringbuf-shared\n"
 917             "@code{#t} if this stringbuf is shared\n"
 918             "@item stringbuf-wide\n"
 919             "@code{#t} if this stringbuf's characters are stored in a\n"
 920             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
 921             "buffer\n"
 922             "@end table")
 923 #define FUNC_NAME s_scm_sys_string_dump
 924 {
 925   SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
 926   SCM buf;
 927   SCM_VALIDATE_STRING (1, str);
 928
 929   /* String info */
 930   e1 = scm_cons (scm_from_latin1_symbol ("string"),
 931                  str);
 932   e2 = scm_cons (scm_from_latin1_symbol ("start"),
 933                  scm_from_size_t (STRING_START (str)));
 934   e3 = scm_cons (scm_from_latin1_symbol ("length"),
 935                  scm_from_size_t (STRING_LENGTH (str)));
 936
 937   if (IS_SH_STRING (str))
 938     {
 939       e4 = scm_cons (scm_from_latin1_symbol ("shared"),
 940                      SH_STRING_STRING (str));
 941       buf = STRING_STRINGBUF (SH_STRING_STRING (str));
 942     }
 943   else
 944     {
 945       e4 = scm_cons (scm_from_latin1_symbol ("shared"),
 946                      SCM_BOOL_F);
 947       buf = STRING_STRINGBUF (str);
 948     }
 949
 950   if (IS_RO_STRING (str))
 951     e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
 952                    SCM_BOOL_T);
 953   else
 954     e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
 955                    SCM_BOOL_F);
 956
 957   /* Stringbuf info */
 958   if (!STRINGBUF_WIDE (buf))
 959     {
 960       size_t len = STRINGBUF_LENGTH (buf);
 961       char *cbuf;
 962       SCM sbc = scm_i_make_string (len, &cbuf, 0);
 963       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
 964       e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 965                      sbc);
 966     }
 967   else
 968     {
 969       size_t len = STRINGBUF_LENGTH (buf);
 970       scm_t_wchar *cbuf;
 971       SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
 972       u32_cpy ((scm_t_uint32 *) cbuf,
 973                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
 974       e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 975                      sbc);
 976     }
 977   e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
 978                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
 979   if (STRINGBUF_SHARED (buf))
 980     e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 981                    SCM_BOOL_T);
 982   else
 983     e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 984                    SCM_BOOL_F);
 985   if (STRINGBUF_WIDE (buf))
 986     e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 987                    SCM_BOOL_T);
 988   else
 989     e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 990                    SCM_BOOL_F);
 991
 992   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
 993 }
 994 #undef FUNC_NAME
 995
 996 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
 997             "Returns an association list containing debugging information\n"
 998             "for @var{sym}. The association list has the following entries."
 999             "@table @code\n"
1000             "@item symbol\n"
1001             "The symbol itself\n"
1002             "@item hash\n"
1003             "Its hash value\n"
1004             "@item interned\n"
1005             "@code{#t} if it is an interned symbol\n"
1006             "@item stringbuf-chars\n"
1007             "A new string containing this symbols's stringbuf's characters\n"
1008             "@item stringbuf-length\n"
1009             "The number of characters in this stringbuf\n"
1010             "@item stringbuf-shared\n"
1011             "@code{#t} if this stringbuf is shared\n"
1012             "@item stringbuf-wide\n"
1013             "@code{#t} if this stringbuf's characters are stored in a\n"
1014             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
1015             "buffer\n"
1016             "@end table")
1017 #define FUNC_NAME s_scm_sys_symbol_dump
1018 {
1019   SCM e1, e2, e3, e4, e5, e6, e7;
1020   SCM buf;
1021   SCM_VALIDATE_SYMBOL (1, sym);
1022   e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
1023                  sym);
1024   e2 = scm_cons (scm_from_latin1_symbol ("hash"),
1025                  scm_from_ulong (scm_i_symbol_hash (sym)));
1026   e3 = scm_cons (scm_from_latin1_symbol ("interned"),
1027                  scm_symbol_interned_p (sym));
1028   buf = SYMBOL_STRINGBUF (sym);
1029
1030   /* Stringbuf info */
1031   if (!STRINGBUF_WIDE (buf))
1032     {
1033       size_t len = STRINGBUF_LENGTH (buf);
1034       char *cbuf;
1035       SCM sbc = scm_i_make_string (len, &cbuf, 0);
1036       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
1037       e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1038                      sbc);
1039     }
1040   else
1041     {
1042       size_t len = STRINGBUF_LENGTH (buf);
1043       scm_t_wchar *cbuf;
1044       SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
1045       u32_cpy ((scm_t_uint32 *) cbuf,
1046                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
1047       e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1048                      sbc);
1049     }
1050   e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
1051                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
1052   if (STRINGBUF_SHARED (buf))
1053     e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1054                    SCM_BOOL_T);
1055   else
1056     e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1057                    SCM_BOOL_F);
1058   if (STRINGBUF_WIDE (buf))
1059     e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1060                     SCM_BOOL_T);
1061   else
1062     e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1063                     SCM_BOOL_F);
1064   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
1065
1066 }
1067 #undef FUNC_NAME
1068
1069 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1070
1071 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
1072 #define FUNC_NAME s_scm_sys_stringbuf_hist
1073 {
1074   int i;
1075   for (i = 0; i < 1000; i++)
1076     if (lenhist[i])
1077       fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1078   fprintf (stderr, ">999: %u\n", lenhist[1000]);
1079   return SCM_UNSPECIFIED;
1080 }
1081 #undef FUNC_NAME
1082
1083 #endif
1084
1085 \f
1086
1087 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1088             (SCM obj),
1089             "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1090 #define FUNC_NAME s_scm_string_p
1091 {
1092   return scm_from_bool (IS_STRING (obj));
1093 }
1094 #undef FUNC_NAME
1095
1096
1097 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1098
1099 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1100             (SCM chrs),
1101             "@deffnx {Scheme Procedure} list->string chrs\n"
1102             "Return a newly allocated string composed of the arguments,\n"
1103             "@var{chrs}.")
1104 #define FUNC_NAME s_scm_string
1105 {
1106   SCM result = SCM_BOOL_F;
1107   SCM rest;
1108   size_t len;
1109   size_t p = 0;
1110   long i;
1111   int wide = 0;
1112
1113   /* Verify that this is a list of chars.  */
1114   i = scm_ilength (chrs);
1115   SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1116
1117   len = (size_t) i;
1118   rest = chrs;
1119
1120   while (len > 0 && scm_is_pair (rest))
1121     {
1122       SCM elt = SCM_CAR (rest);
1123       SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1124       if (SCM_CHAR (elt) > 0xFF)
1125         wide = 1;
1126       rest = SCM_CDR (rest);
1127       len--;
1128       scm_remember_upto_here_1 (elt);
1129     }
1130
1131   /* Construct a string containing this list of chars.  */
1132   len = (size_t) i;
1133   rest = chrs;
1134
1135   if (wide == 0)
1136     {
1137       char *buf;
1138
1139       result = scm_i_make_string (len, NULL, 0);
1140       result = scm_i_string_start_writing (result);
1141       buf = scm_i_string_writable_chars (result);
1142       while (len > 0 && scm_is_pair (rest))
1143         {
1144           SCM elt = SCM_CAR (rest);
1145           buf[p] = (unsigned char) SCM_CHAR (elt);
1146           p++;
1147           rest = SCM_CDR (rest);
1148           len--;
1149           scm_remember_upto_here_1 (elt);
1150         }
1151     }
1152   else
1153     {
1154       scm_t_wchar *buf;
1155
1156       result = scm_i_make_wide_string (len, NULL, 0);
1157       result = scm_i_string_start_writing (result);
1158       buf = scm_i_string_writable_wide_chars (result);
1159       while (len > 0 && scm_is_pair (rest))
1160         {
1161           SCM elt = SCM_CAR (rest);
1162           buf[p] = SCM_CHAR (elt);
1163           p++;
1164           rest = SCM_CDR (rest);
1165           len--;
1166           scm_remember_upto_here_1 (elt);
1167         }
1168     }
1169   scm_i_string_stop_writing ();
1170
1171   if (len > 0)
1172     scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1173   if (!scm_is_null (rest))
1174     scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1175
1176   return result;
1177 }
1178 #undef FUNC_NAME
1179
1180 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1181             (SCM k, SCM chr),
1182             "Return a newly allocated string of\n"
1183             "length @var{k}.  If @var{chr} is given, then all elements of\n"
1184             "the string are initialized to @var{chr}, otherwise the contents\n"
1185             "of the string are all set to @code{#\nul}.")
1186 #define FUNC_NAME s_scm_make_string
1187 {
1188   return scm_c_make_string (scm_to_size_t (k), chr);
1189 }
1190 #undef FUNC_NAME
1191
1192 SCM
1193 scm_c_make_string (size_t len, SCM chr)
1194 #define FUNC_NAME NULL
1195 {
1196   size_t p;
1197   char *contents = NULL;
1198   SCM res = scm_i_make_string (len, &contents, 0);
1199
1200   /* If no char is given, initialize string contents to NULL.  */
1201   if (SCM_UNBNDP (chr))
1202     memset (contents, 0, len);
1203   else
1204     {
1205       SCM_VALIDATE_CHAR (0, chr);
1206       res = scm_i_string_start_writing (res);
1207       for (p = 0; p < len; p++)
1208         scm_i_string_set_x (res, p, SCM_CHAR (chr));
1209       scm_i_string_stop_writing ();
1210     }
1211
1212   return res;
1213 }
1214 #undef FUNC_NAME
1215
1216 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1217             (SCM string),
1218             "Return the number of characters in @var{string}.")
1219 #define FUNC_NAME s_scm_string_length
1220 {
1221   SCM_VALIDATE_STRING (1, string);
1222   return scm_from_size_t (STRING_LENGTH (string));
1223 }
1224 #undef FUNC_NAME
1225
1226 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1227             (SCM string),
1228             "Return the bytes used to represent a character in @var{string}."
1229             "This will return 1 or 4.")
1230 #define FUNC_NAME s_scm_string_bytes_per_char
1231 {
1232   SCM_VALIDATE_STRING (1, string);
1233   if (!scm_i_is_narrow_string (string))
1234     return scm_from_int (4);
1235
1236   return scm_from_int (1);
1237 }
1238 #undef FUNC_NAME
1239
1240 size_t
1241 scm_c_string_length (SCM string)
1242 {
1243   if (!IS_STRING (string))
1244     scm_wrong_type_arg_msg (NULL, 0, string, "string");
1245   return STRING_LENGTH (string);
1246 }
1247
1248 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1249             (SCM str, SCM k),
1250             "Return character @var{k} of @var{str} using zero-origin\n"
1251             "indexing. @var{k} must be a valid index of @var{str}.")
1252 #define FUNC_NAME s_scm_string_ref
1253 {
1254   size_t len;
1255   unsigned long idx;
1256
1257   SCM_VALIDATE_STRING (1, str);
1258
1259   len = scm_i_string_length (str);
1260   if (SCM_LIKELY (len > 0))
1261     idx = scm_to_unsigned_integer (k, 0, len - 1);
1262   else
1263     scm_out_of_range (NULL, k);
1264
1265   if (scm_i_is_narrow_string (str))
1266     return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1267   else
1268     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1269 }
1270 #undef FUNC_NAME
1271
1272 SCM
1273 scm_c_string_ref (SCM str, size_t p)
1274 {
1275   if (p >= scm_i_string_length (str))
1276     scm_out_of_range (NULL, scm_from_size_t (p));
1277   if (scm_i_is_narrow_string (str))
1278     return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1279   else
1280     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1281
1282 }
1283
1284 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1285             (SCM str, SCM k, SCM chr),
1286             "Store @var{chr} in element @var{k} of @var{str} and return\n"
1287             "an unspecified value. @var{k} must be a valid index of\n"
1288             "@var{str}.")
1289 #define FUNC_NAME s_scm_string_set_x
1290 {
1291   size_t len;
1292   unsigned long idx;
1293
1294   SCM_VALIDATE_STRING (1, str);
1295
1296   len = scm_i_string_length (str);
1297   if (SCM_LIKELY (len > 0))
1298     idx = scm_to_unsigned_integer (k, 0, len - 1);
1299   else
1300     scm_out_of_range (NULL, k);
1301
1302   SCM_VALIDATE_CHAR (3, chr);
1303   str = scm_i_string_start_writing (str);
1304   scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1305   scm_i_string_stop_writing ();
1306
1307   return SCM_UNSPECIFIED;
1308 }
1309 #undef FUNC_NAME
1310
1311 void
1312 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1313 {
1314   if (p >= scm_i_string_length (str))
1315     scm_out_of_range (NULL, scm_from_size_t (p));
1316   str = scm_i_string_start_writing (str);
1317   scm_i_string_set_x (str, p, SCM_CHAR (chr));
1318   scm_i_string_stop_writing ();
1319 }
1320
1321 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1322             (SCM str, SCM start, SCM end),
1323             "Return a newly allocated string formed from the characters\n"
1324             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1325             "ending with index @var{end} (exclusive).\n"
1326             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1327             "exact integers satisfying:\n\n"
1328             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1329 #define FUNC_NAME s_scm_substring
1330 {
1331   size_t len, from, to;
1332
1333   SCM_VALIDATE_STRING (1, str);
1334   len = scm_i_string_length (str);
1335   from = scm_to_unsigned_integer (start, 0, len);
1336   if (SCM_UNBNDP (end))
1337     to = len;
1338   else
1339     to = scm_to_unsigned_integer (end, from, len);
1340   return scm_i_substring (str, from, to);
1341 }
1342 #undef FUNC_NAME
1343
1344 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1345             (SCM str, SCM start, SCM end),
1346             "Return a newly allocated string formed from the characters\n"
1347             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1348             "ending with index @var{end} (exclusive).\n"
1349             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1350             "exact integers satisfying:\n"
1351             "\n"
1352             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1353             "\n"
1354             "The returned string is read-only.\n")
1355 #define FUNC_NAME s_scm_substring_read_only
1356 {
1357   size_t len, from, to;
1358
1359   SCM_VALIDATE_STRING (1, str);
1360   len = scm_i_string_length (str);
1361   from = scm_to_unsigned_integer (start, 0, len);
1362   if (SCM_UNBNDP (end))
1363     to = len;
1364   else
1365     to = scm_to_unsigned_integer (end, from, len);
1366   return scm_i_substring_read_only (str, from, to);
1367 }
1368 #undef FUNC_NAME
1369
1370 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1371             (SCM str, SCM start, SCM end),
1372             "Return a newly allocated string formed from the characters\n"
1373             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1374             "ending with index @var{end} (exclusive).\n"
1375             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1376             "exact integers satisfying:\n\n"
1377             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1378 #define FUNC_NAME s_scm_substring_copy
1379 {
1380   /* For the Scheme version, START is mandatory, but for the C
1381      version, it is optional.  See scm_string_copy in srfi-13.c for a
1382      rationale.
1383   */
1384
1385   size_t from, to;
1386
1387   SCM_VALIDATE_STRING (1, str);
1388   scm_i_get_substring_spec (scm_i_string_length (str),
1389                             start, &from, end, &to);
1390   return scm_i_substring_copy (str, from, to);
1391 }
1392 #undef FUNC_NAME
1393
1394 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1395             (SCM str, SCM start, SCM end),
1396             "Return string that indirectly refers to the characters\n"
1397             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1398             "ending with index @var{end} (exclusive).\n"
1399             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1400             "exact integers satisfying:\n\n"
1401             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1402 #define FUNC_NAME s_scm_substring_shared
1403 {
1404   size_t len, from, to;
1405
1406   SCM_VALIDATE_STRING (1, str);
1407   len = scm_i_string_length (str);
1408   from = scm_to_unsigned_integer (start, 0, len);
1409   if (SCM_UNBNDP (end))
1410     to = len;
1411   else
1412     to = scm_to_unsigned_integer (end, from, len);
1413   return scm_i_substring_shared (str, from, to);
1414 }
1415 #undef FUNC_NAME
1416
1417 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1418             (SCM args),
1419             "Return a newly allocated string whose characters form the\n"
1420             "concatenation of the given strings, @var{args}.")
1421 #define FUNC_NAME s_scm_string_append
1422 {
1423   SCM res;
1424   size_t total = 0;
1425   size_t len;
1426   int wide = 0;
1427   SCM l, s;
1428   size_t i;
1429   union
1430   {
1431     char *narrow;
1432     scm_t_wchar *wide;
1433   } data;
1434
1435   SCM_VALIDATE_REST_ARGUMENT (args);
1436   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1437     {
1438       s = SCM_CAR (l);
1439       SCM_VALIDATE_STRING (SCM_ARGn, s);
1440       len = scm_i_string_length (s);
1441       if (((size_t) -1) - total < len)
1442         scm_num_overflow (s_scm_string_append);
1443       total += len;
1444       if (!scm_i_is_narrow_string (s))
1445         wide = 1;
1446     }
1447   data.narrow = NULL;
1448   if (!wide)
1449     res = scm_i_make_string (total, &data.narrow, 0);
1450   else
1451     res = scm_i_make_wide_string (total, &data.wide, 0);
1452
1453   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1454     {
1455       size_t len;
1456       s = SCM_CAR (l);
1457       SCM_VALIDATE_STRING (SCM_ARGn, s);
1458       len = scm_i_string_length (s);
1459       if (len > total)
1460         SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
1461       if (!wide)
1462         {
1463           memcpy (data.narrow, scm_i_string_chars (s), len);
1464           data.narrow += len;
1465         }
1466       else
1467         {
1468           if (scm_i_is_narrow_string (s))
1469             {
1470               const char *src = scm_i_string_chars (s);
1471               for (i = 0; i < len; i++)
1472                 data.wide[i] = (unsigned char) src[i];
1473             }
1474           else
1475             u32_cpy ((scm_t_uint32 *) data.wide,
1476                      (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1477           data.wide += len;
1478         }
1479       total -= len;
1480       scm_remember_upto_here_1 (s);
1481     }
1482   if (total != 0)
1483     SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
1484   return res;
1485 }
1486 #undef FUNC_NAME
1487
1488
1489 \f
1490 /* Charset conversion error handling.  */
1491
1492 SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1493 SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1494
1495 /* Raise an exception informing that character CHR could not be written
1496    to PORT in its current encoding.  */
1497 void
1498 scm_encoding_error (const char *subr, int err, const char *message,
1499                     SCM port, SCM chr)
1500 {
1501   scm_throw (scm_encoding_error_key,
1502              scm_list_n (scm_from_latin1_string (subr),
1503                          scm_from_latin1_string (message),
1504                          scm_from_int (err),
1505                          port, chr,
1506                          SCM_UNDEFINED));
1507 }
1508
1509 /* Raise an exception informing of an encoding error on PORT.  This
1510    means that a character could not be written in PORT's encoding.  */
1511 void
1512 scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1513 {
1514   scm_throw (scm_decoding_error_key,
1515              scm_list_n (scm_from_latin1_string (subr),
1516                          scm_from_latin1_string (message),
1517                          scm_from_int (err),
1518                          port,
1519                          SCM_UNDEFINED));
1520 }
1521
1522 \f
1523 /* String conversion to/from C.  */
1524
1525 static void
1526 decoding_error (const char *func_name, int errno_save,
1527                 const char *str, size_t len)
1528 {
1529   /* Raise an error and pass the raw C string as a bytevector to the `throw'
1530      handler.  */
1531   SCM bv;
1532   signed char *buf;
1533
1534   buf = scm_gc_malloc_pointerless (len, "bytevector");
1535   memcpy (buf, str, len);
1536   bv = scm_c_take_gc_bytevector (buf, len, SCM_BOOL_F);
1537
1538   scm_decoding_error (func_name, errno_save,
1539                       "input locale conversion error", bv);
1540 }
1541
1542 SCM
1543 scm_from_stringn (const char *str, size_t len, const char *encoding,
1544                   scm_t_string_failed_conversion_handler handler)
1545 {
1546   size_t u32len, i;
1547   scm_t_wchar *u32;
1548   int wide = 0;
1549   SCM res;
1550
1551   /* The order of these checks is important. */
1552   if (!str && len != 0)
1553     scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1554   if (len == (size_t) -1)
1555     len = strlen (str);
1556
1557   if (c_strcasecmp (encoding, "ISO-8859-1") == 0 || len == 0)
1558     return scm_from_latin1_stringn (str, len);
1559   else if (c_strcasecmp (encoding, "UTF-8") == 0
1560            && handler == SCM_FAILED_CONVERSION_ERROR)
1561     return scm_from_utf8_stringn (str, len);
1562
1563   u32len = 0;
1564   u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1565                                                 (enum iconv_ilseq_handler)
1566                                                 handler,
1567                                                 str, len,
1568                                                 NULL,
1569                                                 NULL, &u32len);
1570
1571   if (SCM_UNLIKELY (u32 == NULL))
1572     decoding_error (__func__, errno, str, len);
1573
1574   i = 0;
1575   while (i < u32len)
1576     if (u32[i++] > 0xFF)
1577       {
1578         wide = 1;
1579         break;
1580       }
1581
1582   if (!wide)
1583     {
1584       char *dst;
1585       res = scm_i_make_string (u32len, &dst, 0);
1586       for (i = 0; i < u32len; i ++)
1587         dst[i] = (unsigned char) u32[i];
1588       dst[u32len] = '\0';
1589     }
1590   else
1591     {
1592       scm_t_wchar *wdst;
1593       res = scm_i_make_wide_string (u32len, &wdst, 0);
1594       u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1595       wdst[u32len] = 0;
1596     }
1597
1598   free (u32);
1599   return res;
1600 }
1601
1602 SCM
1603 scm_from_locale_string (const char *str)
1604 {
1605   return scm_from_locale_stringn (str, -1);
1606 }
1607
1608 SCM
1609 scm_from_locale_stringn (const char *str, size_t len)
1610 {
1611   return scm_from_stringn (str, len, locale_charset (),
1612                            scm_i_default_port_conversion_handler ());
1613 }
1614
1615 SCM
1616 scm_from_latin1_string (const char *str)
1617 {
1618   return scm_from_latin1_stringn (str, -1);
1619 }
1620
1621 SCM
1622 scm_from_latin1_stringn (const char *str, size_t len)
1623 {
1624   char *buf;
1625   SCM result;
1626
1627   if (len == (size_t) -1)
1628     len = strlen (str);
1629
1630   /* Make a narrow string and copy STR as is.  */
1631   result = scm_i_make_string (len, &buf, 0);
1632   memcpy (buf, str, len);
1633
1634   return result;
1635 }
1636
1637 SCM
1638 scm_from_utf8_string (const char *str)
1639 {
1640   return scm_from_utf8_stringn (str, -1);
1641 }
1642
1643 SCM
1644 scm_from_utf8_stringn (const char *str, size_t len)
1645 {
1646   size_t i, char_len;
1647   const scm_t_uint8 *ustr = (const scm_t_uint8 *) str;
1648   int ascii = 1, narrow = 1;
1649   SCM res;
1650
1651   if (len == (size_t) -1)
1652     len = strlen (str);
1653
1654   i = 0;
1655   char_len = 0;
1656
1657   while (i < len)
1658     {
1659       if (ustr[i] <= 127)
1660         {
1661           char_len++;
1662           i++;
1663         }
1664       else
1665         {
1666           ucs4_t c;
1667           int nbytes;
1668
1669           ascii = 0;
1670
1671           nbytes = u8_mbtouc (&c, ustr + i, len - i);
1672
1673           if (c == 0xfffd)
1674             /* Bad UTF-8.  */
1675             decoding_error (__func__, errno, str, len);
1676
1677           if (c > 255)
1678             narrow = 0;
1679
1680           char_len++;
1681           i += nbytes;
1682         }
1683     }
1684
1685   if (ascii)
1686     {
1687       char *dst;
1688       res = scm_i_make_string (char_len, &dst, 0);
1689       memcpy (dst, str, len);
1690     }
1691   else if (narrow)
1692     {
1693       char *dst;
1694       size_t j;
1695       ucs4_t c;
1696
1697       res = scm_i_make_string (char_len, &dst, 0);
1698
1699       for (i = 0, j = 0; i < len; j++)
1700         {
1701           i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1702           dst[j] = (signed char) c;
1703         }
1704     }
1705   else
1706     {
1707       scm_t_wchar *dst;
1708       size_t j;
1709       ucs4_t c;
1710
1711       res = scm_i_make_wide_string (char_len, &dst, 0);
1712
1713       for (i = 0, j = 0; i < len; j++)
1714         {
1715           i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1716           dst[j] = c;
1717         }
1718     }
1719
1720   return res;
1721 }
1722
1723 SCM
1724 scm_from_utf32_string (const scm_t_wchar *str)
1725 {
1726   return scm_from_utf32_stringn (str, -1);
1727 }
1728
1729 SCM
1730 scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1731 {
1732   SCM result;
1733   scm_t_wchar *buf;
1734
1735   if (len == (size_t) -1)
1736     len = u32_strlen ((uint32_t *) str);
1737
1738   result = scm_i_make_wide_string (len, &buf, 0);
1739   memcpy (buf, str, len * sizeof (scm_t_wchar));
1740   scm_i_try_narrow_string (result);
1741
1742   return result;
1743 }
1744
1745 SCM
1746 scm_from_port_string (const char *str, SCM port)
1747 {
1748   return scm_from_port_stringn (str, -1, port);
1749 }
1750
1751 SCM
1752 scm_from_port_stringn (const char *str, size_t len, SCM port)
1753 {
1754   scm_t_port *pt = SCM_PTAB_ENTRY (port);
1755   scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
1756
1757   if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1)
1758     return scm_from_latin1_stringn (str, len);
1759   else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8
1760            && (pt->ilseq_handler == SCM_FAILED_CONVERSION_ERROR
1761                || (u8_check ((uint8_t *) str, len) == NULL)))
1762     return scm_from_utf8_stringn (str, len);
1763   else
1764     return scm_from_stringn (str, len, pt->encoding, pt->ilseq_handler);
1765 }
1766
1767 /* Create a new scheme string from the C string STR.  The memory of
1768    STR may be used directly as storage for the new string.  */
1769 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1770    would be to register a finalizer to eventually free(3) STR, which isn't
1771    worth it.  Should we just deprecate the `scm_take_' functions?  */
1772 SCM
1773 scm_take_locale_stringn (char *str, size_t len)
1774 {
1775   SCM res;
1776
1777   res = scm_from_locale_stringn (str, len);
1778   free (str);
1779
1780   return res;
1781 }
1782
1783 SCM
1784 scm_take_locale_string (char *str)
1785 {
1786   return scm_take_locale_stringn (str, -1);
1787 }
1788
1789 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1790    *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1791    Set *LENP to the size of the resulting string.
1792
1793    FIXME: This is a hack we should get rid of.  See
1794    <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1795    for details.  */
1796 static void
1797 unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
1798 {
1799   char *before, *after;
1800   size_t i, j;
1801
1802   before = buf;
1803   after = buf;
1804   i = 0;
1805   j = 0;
1806   while (i < *lenp)
1807     {
1808       if ((i <= *lenp - 6)
1809           && before[i] == '\\'
1810           && before[i + 1] == 'u'
1811           && before[i + 2] == '0' && before[i + 3] == '0')
1812         {
1813           /* Convert \u00NN to \xNN */
1814           after[j] = '\\';
1815           after[j + 1] = 'x';
1816           after[j + 2] = tolower ((int) before[i + 4]);
1817           after[j + 3] = tolower ((int) before[i + 5]);
1818           i += 6;
1819           j += 4;
1820         }
1821       else if ((i <= *lenp - 10)
1822                && before[i] == '\\'
1823                && before[i + 1] == 'U'
1824                && before[i + 2] == '0' && before[i + 3] == '0')
1825         {
1826           /* Convert \U00NNNNNN to \UNNNNNN */
1827           after[j] = '\\';
1828           after[j + 1] = 'U';
1829           after[j + 2] = tolower ((int) before[i + 4]);
1830           after[j + 3] = tolower ((int) before[i + 5]);
1831           after[j + 4] = tolower ((int) before[i + 6]);
1832           after[j + 5] = tolower ((int) before[i + 7]);
1833           after[j + 6] = tolower ((int) before[i + 8]);
1834           after[j + 7] = tolower ((int) before[i + 9]);
1835           i += 10;
1836           j += 8;
1837         }
1838       else
1839         {
1840           after[j] = before[i];
1841           i++;
1842           j++;
1843         }
1844     }
1845   *lenp = j;
1846 }
1847
1848 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1849    *LENP-byte locale-encoded string, to `\xXXXX;'.  Set *LEN to the size
1850    of the resulting string.  BUF must be large enough to handle the
1851    worst case when `\uXXXX' escapes (6 characters) are replaced by
1852    `\xXXXX;' (7 characters).  */
1853 static void
1854 unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
1855 {
1856   char *before, *after;
1857   size_t i, j;
1858   /* The worst case is if the input string contains all 4-digit hex escapes.
1859      "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1860   size_t max_out_len = (*lenp * 7) / 6 + 1;
1861   size_t nzeros, ndigits;
1862
1863   before = buf;
1864   after = alloca (max_out_len);
1865   i = 0;
1866   j = 0;
1867   while (i < *lenp)
1868     {
1869       if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1870           || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1871         {
1872           if (before[i + 1] == 'u')
1873             ndigits = 4;
1874           else if (before[i + 1] == 'U')
1875             ndigits = 8;
1876           else
1877             abort ();
1878
1879           /* Add the R6RS hex escape initial sequence.  */
1880           after[j] = '\\';
1881           after[j + 1] = 'x';
1882
1883           /* Move string positions to the start of the hex numbers.  */
1884           i += 2;
1885           j += 2;
1886
1887           /* Find the number of initial zeros in this hex number.  */
1888           nzeros = 0;
1889           while (before[i + nzeros] == '0' && nzeros < ndigits)
1890             nzeros++;
1891
1892           /* Copy the number, skipping initial zeros, and then move the string
1893              positions.  */
1894           if (nzeros == ndigits)
1895             {
1896               after[j] = '0';
1897               i += ndigits;
1898               j += 1;
1899             }
1900           else
1901             {
1902               int pos;
1903               for (pos = 0; pos < ndigits - nzeros; pos++)
1904                 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1905               i += ndigits;
1906               j += (ndigits - nzeros);
1907             }
1908
1909           /* Add terminating semicolon.  */
1910           after[j] = ';';
1911           j++;
1912         }
1913       else
1914         {
1915           after[j] = before[i];
1916           i++;
1917           j++;
1918         }
1919     }
1920   *lenp = j;
1921   memcpy (before, after, j);
1922 }
1923
1924 char *
1925 scm_to_locale_string (SCM str)
1926 {
1927   return scm_to_locale_stringn (str, NULL);
1928 }
1929
1930 char *
1931 scm_to_locale_stringn (SCM str, size_t *lenp)
1932 {
1933   return scm_to_stringn (str, lenp,
1934                          locale_charset (),
1935                          scm_i_default_port_conversion_handler ());
1936 }
1937
1938 char *
1939 scm_to_latin1_string (SCM str)
1940 {
1941   return scm_to_latin1_stringn (str, NULL);
1942 }
1943
1944 char *
1945 scm_to_latin1_stringn (SCM str, size_t *lenp)
1946 #define FUNC_NAME "scm_to_latin1_stringn"
1947 {
1948   char *result;
1949
1950   SCM_VALIDATE_STRING (1, str);
1951
1952   if (scm_i_is_narrow_string (str))
1953     {
1954       size_t len = scm_i_string_length (str);
1955
1956       if (lenp)
1957         *lenp = len;
1958
1959       result = scm_strndup (scm_i_string_data (str), len);
1960     }
1961   else
1962     result = scm_to_stringn (str, lenp, NULL,
1963                              SCM_FAILED_CONVERSION_ERROR);
1964
1965   return result;
1966 }
1967 #undef FUNC_NAME
1968
1969 char *
1970 scm_to_utf8_string (SCM str)
1971 {
1972   return scm_to_utf8_stringn (str, NULL);
1973 }
1974
1975 static size_t
1976 latin1_u8_strlen (const scm_t_uint8 *str, size_t len)
1977 {
1978   size_t ret, i;
1979   for (i = 0, ret = 0; i < len; i++)
1980     ret += (str[i] < 128) ? 1 : 2;
1981   return ret;
1982 }
1983
1984 static scm_t_uint8*
1985 latin1_to_u8 (const scm_t_uint8 *str, size_t latin_len,
1986               scm_t_uint8 *u8_result, size_t *u8_lenp)
1987 {
1988   size_t i, n;
1989   size_t u8_len = latin1_u8_strlen (str, latin_len);
1990
1991   if (!(u8_result && u8_lenp && *u8_lenp > u8_len))
1992     u8_result = scm_malloc (u8_len + 1);
1993   if (u8_lenp)
1994     *u8_lenp = u8_len;
1995
1996   for (i = 0, n = 0; i < latin_len; i++)
1997     n += u8_uctomb (u8_result + n, str[i], u8_len - n);
1998   if (n != u8_len)
1999     abort ();
2000   u8_result[n] = 0;
2001
2002   return u8_result;
2003 }
2004
2005 /* UTF-8 code table
2006
2007    (Note that this includes code points that are not allowed by Unicode,
2008     but since this function has no way to report an error, and its
2009     purpose is to determine the size of destination buffers for
2010     libunicode conversion functions, we err on the safe side and handle
2011     everything that libunicode might conceivably handle, now or in the
2012     future.)
2013
2014    Char. number range  |        UTF-8 octet sequence
2015       (hexadecimal)    |              (binary)
2016    --------------------+------------------------------------------------------
2017    0000 0000-0000 007F | 0xxxxxxx
2018    0000 0080-0000 07FF | 110xxxxx 10xxxxxx
2019    0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
2020    0001 0000-001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
2021    0020 0000-03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2022    0400 0000-7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2023 */
2024
2025 static size_t
2026 u32_u8_length_in_bytes (const scm_t_uint32 *str, size_t len)
2027 {
2028   size_t ret, i;
2029
2030   for (i = 0, ret = 0; i < len; i++)
2031     {
2032       scm_t_uint32 c = str[i];
2033
2034       if (c <= 0x7f)
2035         ret += 1;
2036       else if (c <= 0x7ff)
2037         ret += 2;
2038       else if (c <= 0xffff)
2039         ret += 3;
2040       else if (c <= 0x1fffff)
2041         ret += 4;
2042       else if (c <= 0x3ffffff)
2043         ret += 5;
2044       else
2045         ret += 6;
2046     }
2047
2048   return ret;
2049 }
2050
2051 char *
2052 scm_to_utf8_stringn (SCM str, size_t *lenp)
2053 #define FUNC_NAME "scm_to_utf8_stringn"
2054 {
2055   SCM_VALIDATE_STRING (1, str);
2056
2057   if (scm_i_is_narrow_string (str))
2058     return (char *) latin1_to_u8 ((scm_t_uint8 *) scm_i_string_chars (str),
2059                                   scm_i_string_length (str),
2060                                   NULL, lenp);
2061   else
2062     {
2063       scm_t_uint32 *chars = (scm_t_uint32 *) scm_i_string_wide_chars (str);
2064       scm_t_uint8 *buf, *ret;
2065       size_t num_chars = scm_i_string_length (str);
2066       size_t num_bytes_predicted, num_bytes_actual;
2067
2068       num_bytes_predicted = u32_u8_length_in_bytes (chars, num_chars);
2069
2070       if (lenp)
2071         {
2072           *lenp = num_bytes_predicted;
2073           buf = scm_malloc (num_bytes_predicted);
2074         }
2075       else
2076         {
2077           buf = scm_malloc (num_bytes_predicted + 1);
2078           buf[num_bytes_predicted] = 0;
2079         }
2080
2081       num_bytes_actual = num_bytes_predicted;
2082       ret = u32_to_u8 (chars, num_chars, buf, &num_bytes_actual);
2083
2084       if (SCM_LIKELY (ret == buf && num_bytes_actual == num_bytes_predicted))
2085         return (char *) ret;
2086
2087       /* An error: a bad codepoint.  */
2088       {
2089         int saved_errno = errno;
2090
2091         free (buf);
2092         if (!saved_errno)
2093           abort ();
2094
2095         scm_decoding_error ("scm_to_utf8_stringn", errno,
2096                             "invalid codepoint in string", str);
2097
2098         /* Not reached.  */
2099         return NULL;
2100       }
2101     }
2102 }
2103 #undef FUNC_NAME
2104
2105 scm_t_wchar *
2106 scm_to_utf32_string (SCM str)
2107 {
2108   return scm_to_utf32_stringn (str, NULL);
2109 }
2110
2111 scm_t_wchar *
2112 scm_to_utf32_stringn (SCM str, size_t *lenp)
2113 #define FUNC_NAME "scm_to_utf32_stringn"
2114 {
2115   scm_t_wchar *result;
2116
2117   SCM_VALIDATE_STRING (1, str);
2118
2119   if (scm_i_is_narrow_string (str))
2120     {
2121       scm_t_uint8 *codepoints;
2122       size_t i, len;
2123
2124       codepoints = (scm_t_uint8*) scm_i_string_chars (str);
2125       len = scm_i_string_length (str);
2126       if (lenp)
2127         *lenp = len;
2128
2129       result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2130       for (i = 0; i < len; i++)
2131         result[i] = codepoints[i];
2132       result[len] = 0;
2133     }
2134   else
2135     {
2136       size_t len;
2137
2138       len = scm_i_string_length (str);
2139       if (lenp)
2140         *lenp = len;
2141
2142       result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2143       memcpy (result, scm_i_string_wide_chars (str),
2144               len * sizeof (scm_t_wchar));
2145       result[len] = 0;
2146     }
2147
2148   return result;
2149 }
2150 #undef FUNC_NAME
2151
2152 char *
2153 scm_to_port_string (SCM str, SCM port)
2154 {
2155   return scm_to_port_stringn (str, NULL, port);
2156 }
2157
2158 char *
2159 scm_to_port_stringn (SCM str, size_t *lenp, SCM port)
2160 {
2161   scm_t_port *pt = SCM_PTAB_ENTRY (port);
2162   scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
2163
2164   if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1
2165       && pt->ilseq_handler == SCM_FAILED_CONVERSION_ERROR)
2166     return scm_to_latin1_stringn (str, lenp);
2167   else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8)
2168     return scm_to_utf8_stringn (str, lenp);
2169   else
2170     return scm_to_stringn (str, lenp, pt->encoding, pt->ilseq_handler);
2171 }
2172
2173 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
2174    according to ENCODING.  If LENP is non-NULL, set it to the size in bytes of
2175    the returned buffer.  If the conversion to ENCODING fails, apply the strategy
2176    defined by HANDLER.  */
2177 char *
2178 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
2179                 scm_t_string_failed_conversion_handler handler)
2180 {
2181   char *buf;
2182   size_t ilen, len, i;
2183   int ret;
2184   const char *enc;
2185
2186   if (!scm_is_string (str))
2187     scm_wrong_type_arg_msg (NULL, 0, str, "string");
2188   ilen = scm_i_string_length (str);
2189
2190   if (ilen == 0)
2191     {
2192       buf = scm_malloc (1);
2193       buf[0] = '\0';
2194       if (lenp)
2195         *lenp = 0;
2196       return buf;
2197     }
2198
2199   if (lenp == NULL)
2200     for (i = 0; i < ilen; i++)
2201       if (scm_i_string_ref (str, i) == '\0')
2202         scm_misc_error (NULL,
2203                         "string contains #\\nul character: ~S",
2204                         scm_list_1 (str));
2205
2206   if (scm_i_is_narrow_string (str)
2207       && c_strcasecmp (encoding, "ISO-8859-1") == 0)
2208     {
2209       /* If using native Latin-1 encoding, just copy the string
2210          contents.  */
2211       if (lenp)
2212         {
2213           buf = scm_malloc (ilen);
2214           memcpy (buf, scm_i_string_chars (str), ilen);
2215           *lenp = ilen;
2216           return buf;
2217         }
2218       else
2219         {
2220           buf = scm_malloc (ilen + 1);
2221           memcpy (buf, scm_i_string_chars (str), ilen);
2222           buf[ilen] = '\0';
2223           return buf;
2224         }
2225     }
2226
2227
2228   buf = NULL;
2229   len = 0;
2230   enc = encoding;
2231   if (enc == NULL)
2232     enc = "ISO-8859-1";
2233   if (scm_i_is_narrow_string (str))
2234     {
2235       ret = mem_iconveh (scm_i_string_chars (str), ilen,
2236                          "ISO-8859-1", enc,
2237                          (enum iconv_ilseq_handler) handler, NULL,
2238                          &buf, &len);
2239
2240       if (ret != 0)
2241         scm_encoding_error (__func__, errno,
2242                             "cannot convert narrow string to output locale",
2243                             SCM_BOOL_F,
2244                             /* FIXME: Faulty character unknown.  */
2245                             SCM_BOOL_F);
2246     }
2247   else
2248     {
2249       buf = u32_conv_to_encoding (enc,
2250                                   (enum iconv_ilseq_handler) handler,
2251                                   (scm_t_uint32 *) scm_i_string_wide_chars (str),
2252                                   ilen,
2253                                   NULL,
2254                                   NULL, &len);
2255       if (buf == NULL)
2256         scm_encoding_error (__func__, errno,
2257                             "cannot convert wide string to output locale",
2258                             SCM_BOOL_F,
2259                             /* FIXME: Faulty character unknown.  */
2260                             SCM_BOOL_F);
2261     }
2262   if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
2263     {
2264       if (SCM_R6RS_ESCAPES_P)
2265         {
2266           /* The worst case is if the input string contains all 4-digit
2267              hex escapes.  "\uXXXX" (six characters) becomes "\xXXXX;"
2268              (seven characters).  Make BUF large enough to hold
2269              that.  */
2270           buf = scm_realloc (buf, (len * 7) / 6 + 1);
2271           unistring_escapes_to_r6rs_escapes (buf, &len);
2272         }
2273       else
2274         unistring_escapes_to_guile_escapes (buf, &len);
2275
2276       buf = scm_realloc (buf, len);
2277     }
2278   if (lenp)
2279     *lenp = len;
2280   else
2281     {
2282       buf = scm_realloc (buf, len + 1);
2283       buf[len] = '\0';
2284     }
2285
2286   scm_remember_upto_here_1 (str);
2287   return buf;
2288 }
2289
2290 size_t
2291 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
2292 {
2293   size_t len;
2294   char *result = NULL;
2295   if (!scm_is_string (str))
2296     scm_wrong_type_arg_msg (NULL, 0, str, "string");
2297   result = scm_to_locale_stringn (str, &len);
2298
2299   memcpy (buf, result, (len > max_len) ? max_len : len);
2300   free (result);
2301
2302   scm_remember_upto_here_1 (str);
2303   return len;
2304 }
2305
2306 \f
2307 /* Unicode string normalization.  */
2308
2309 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2310    libguile/i18n.c.  It would be useful to have this factored out into a more
2311    convenient location, but its use of alloca makes that tricky to do. */
2312
2313 static SCM
2314 normalize_str (SCM string, uninorm_t form)
2315 {
2316   SCM ret;
2317   scm_t_uint32 *w_str;
2318   scm_t_wchar *cbuf;
2319   size_t rlen, len = scm_i_string_length (string);
2320
2321   if (scm_i_is_narrow_string (string))
2322     {
2323       size_t i;
2324       const char *buf = scm_i_string_chars (string);
2325
2326       w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2327
2328       for (i = 0; i < len; i ++)
2329         w_str[i] = (unsigned char) buf[i];
2330       w_str[len] = 0;
2331     }
2332   else
2333     w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2334
2335   w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2336
2337   ret = scm_i_make_wide_string (rlen, &cbuf, 0);
2338   u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2339   free (w_str);
2340
2341   scm_i_try_narrow_string (ret);
2342
2343   return ret;
2344 }
2345
2346 SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2347             (SCM string),
2348             "Returns the NFC normalized form of @var{string}.")
2349 #define FUNC_NAME s_scm_string_normalize_nfc
2350 {
2351   SCM_VALIDATE_STRING (1, string);
2352   return normalize_str (string, UNINORM_NFC);
2353 }
2354 #undef FUNC_NAME
2355
2356 SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2357             (SCM string),
2358             "Returns the NFD normalized form of @var{string}.")
2359 #define FUNC_NAME s_scm_string_normalize_nfd
2360 {
2361   SCM_VALIDATE_STRING (1, string);
2362   return normalize_str (string, UNINORM_NFD);
2363 }
2364 #undef FUNC_NAME
2365
2366 SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2367             (SCM string),
2368             "Returns the NFKC normalized form of @var{string}.")
2369 #define FUNC_NAME s_scm_string_normalize_nfkc
2370 {
2371   SCM_VALIDATE_STRING (1, string);
2372   return normalize_str (string, UNINORM_NFKC);
2373 }
2374 #undef FUNC_NAME
2375
2376 SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2377             (SCM string),
2378             "Returns the NFKD normalized form of @var{string}.")
2379 #define FUNC_NAME s_scm_string_normalize_nfkd
2380 {
2381   SCM_VALIDATE_STRING (1, string);
2382   return normalize_str (string, UNINORM_NFKD);
2383 }
2384 #undef FUNC_NAME
2385
2386 /* converts C scm_array of strings to SCM scm_list of strings.
2387    If argc < 0, a null terminated scm_array is assumed.
2388    The current locale encoding is assumed */
2389 SCM
2390 scm_makfromstrs (int argc, char **argv)
2391 {
2392   int i = argc;
2393   SCM lst = SCM_EOL;
2394   if (0 > i)
2395     for (i = 0; argv[i]; i++);
2396   while (i--)
2397     lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2398   return lst;
2399 }
2400
2401 /* Return a newly allocated array of char pointers to each of the strings
2402    in args, with a terminating NULL pointer.  The strings are encoded using
2403    the current locale. */
2404
2405 char **
2406 scm_i_allocate_string_pointers (SCM list)
2407 #define FUNC_NAME "scm_i_allocate_string_pointers"
2408 {
2409   char **result;
2410   int list_len = scm_ilength (list);
2411   int i;
2412
2413   if (list_len < 0)
2414     scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2415
2416   result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2417                           "string pointers");
2418   result[list_len] = NULL;
2419
2420   /* The list might have been modified in another thread, so
2421      we check LIST before each access.
2422    */
2423   for (i = 0; i < list_len && scm_is_pair (list); i++)
2424     {
2425       SCM str = SCM_CAR (list);
2426       size_t len;  /* String length in bytes */
2427       char *c_str = scm_to_locale_stringn (str, &len);
2428
2429       /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2430          scm_malloc to allocate the returned string, which must be
2431          explicitly deallocated.  This forces us to copy the string a
2432          second time into a new buffer.  Ideally there would be variants
2433          of scm_to_*_stringn that can return garbage-collected buffers. */
2434
2435       result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2436       memcpy (result[i], c_str, len);
2437       result[i][len] = '\0';
2438       free (c_str);
2439
2440       list = SCM_CDR (list);
2441     }
2442
2443   return result;
2444 }
2445 #undef FUNC_NAME
2446
2447 void
2448 scm_i_get_substring_spec (size_t len,
2449                           SCM start, size_t *cstart,
2450                           SCM end, size_t *cend)
2451 {
2452   if (SCM_UNBNDP (start))
2453     *cstart = 0;
2454   else
2455     *cstart = scm_to_unsigned_integer (start, 0, len);
2456
2457   if (SCM_UNBNDP (end))
2458     *cend = len;
2459   else
2460     *cend = scm_to_unsigned_integer (end, *cstart, len);
2461 }
2462
2463 static SCM
2464 string_handle_ref (scm_t_array_handle *h, size_t index)
2465 {
2466   return scm_c_string_ref (h->array, index);
2467 }
2468
2469 static void
2470 string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2471 {
2472   scm_c_string_set_x (h->array, index, val);
2473 }
2474
2475 static void
2476 string_get_handle (SCM v, scm_t_array_handle *h)
2477 {
2478   h->array = v;
2479   h->ndims = 1;
2480   h->dims = &h->dim0;
2481   h->dim0.lbnd = 0;
2482   h->dim0.ubnd = scm_c_string_length (v) - 1;
2483   h->dim0.inc = 1;
2484   h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2485   h->elements = h->writable_elements = NULL;
2486 }
2487
2488 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2489                           string_handle_ref, string_handle_set,
2490                           string_get_handle)
2491 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2492
2493 void
2494 scm_init_strings ()
2495 {
2496   scm_nullstr = scm_i_make_string (0, NULL, 0);
2497
2498 #include "libguile/strings.x"
2499 }
2500
2501
2502 /*
2503   Local Variables:
2504   c-file-style: "gnu"
2505   End:
2506 */