libguile/strings.c

   1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc.
   2  *
   3  * This library is free software; you can redistribute it and/or
   4  * modify it under the terms of the GNU Lesser General Public License
   5  * as published by the Free Software Foundation; either version 3 of
   6  * the License, or (at your option) any later version.
   7  *
   8  * This library is distributed in the hope that it will be useful, but
   9  * WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * Lesser General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU Lesser General Public
  14  * License along with this library; if not, write to the Free Software
  15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  16  * 02110-1301 USA
  17  */
  18
  19
  20 \f
  21 #ifdef HAVE_CONFIG_H
  22 # include <config.h>
  23 #endif
  24
  25 #include <alloca.h>
  26 #include <string.h>
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <uninorm.h>
  30 #include <unistr.h>
  31 #include <uniconv.h>
  32 #include <c-strcase.h>
  33
  34 #include "striconveh.h"
  35
  36 #include "libguile/_scm.h"
  37 #include "libguile/chars.h"
  38 #include "libguile/root.h"
  39 #include "libguile/strings.h"
  40 #include "libguile/ports.h"
  41 #include "libguile/ports-internal.h"
  42 #include "libguile/error.h"
  43 #include "libguile/generalized-vectors.h"
  44 #include "libguile/deprecation.h"
  45 #include "libguile/validate.h"
  46 #include "libguile/private-options.h"
  47
  48 \f
  49
  50 /* {Strings}
  51  */
  52
  53
  54 /* Stringbufs
  55  *
  56  * XXX - keeping an accurate refcount during GC seems to be quite
  57  * tricky, so we just keep score of whether a stringbuf might be
  58  * shared, not whether it definitely is.
  59  *
  60  * The scheme I (mvo) tried to keep an accurate reference count would
  61  * recount all strings that point to a stringbuf during the mark-phase
  62  * of the GC.  This was done since one cannot access the stringbuf of
  63  * a string when that string is freed (in order to decrease the
  64  * reference count).  The memory of the stringbuf might have been
  65  * reused already for something completely different.
  66  *
  67  * This recounted worked for a small number of threads beating on
  68  * cow-strings, but it failed randomly with more than 10 threads, say.
  69  * I couldn't figure out what went wrong, so I used the conservative
  70  * approach implemented below.
  71  *
  72  * There are 2 storage strategies for stringbufs: 8-bit and wide.  8-bit
  73  * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
  74  * strings.
  75  */
  76
  77 /* The size in words of the stringbuf header (type tag + size).  */
  78 #define STRINGBUF_HEADER_SIZE   2U
  79
  80 #define STRINGBUF_HEADER_BYTES  (STRINGBUF_HEADER_SIZE * sizeof (SCM))
  81
  82 #define STRINGBUF_F_SHARED      SCM_I_STRINGBUF_F_SHARED
  83 #define STRINGBUF_F_WIDE        SCM_I_STRINGBUF_F_WIDE
  84
  85 #define STRINGBUF_TAG           scm_tc7_stringbuf
  86 #define STRINGBUF_SHARED(buf)   (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
  87 #define STRINGBUF_WIDE(buf)     (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
  88
  89 #define STRINGBUF_CONTENTS(buf) ((void *)                               \
  90                                  SCM_CELL_OBJECT_LOC (buf,              \
  91                                                       STRINGBUF_HEADER_SIZE))
  92 #define STRINGBUF_CHARS(buf)    ((unsigned char *) STRINGBUF_CONTENTS (buf))
  93 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
  94
  95 #define STRINGBUF_LENGTH(buf)   (SCM_CELL_WORD_1 (buf))
  96
  97 #define SET_STRINGBUF_SHARED(buf)                                       \
  98   do                                                                    \
  99     {                                                                   \
 100       /* Don't modify BUF if it's already marked as shared since it might be \
 101          a read-only, statically allocated stringbuf.  */               \
 102       if (SCM_LIKELY (!STRINGBUF_SHARED (buf)))                         \
 103         SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
 104     }                                                                   \
 105   while (0)
 106
 107 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 108 static size_t lenhist[1001];
 109 #endif
 110
 111 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
 112    characters. */
 113 static SCM
 114 make_stringbuf (size_t len)
 115 {
 116   /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
 117      scm_i_symbol_chars, all stringbufs are null-terminated.  Once
 118      SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
 119      has been changed for scm_i_symbol_chars, this null-termination
 120      can be dropped.
 121   */
 122
 123   SCM buf;
 124
 125 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 126   if (len < 1000)
 127     lenhist[len]++;
 128   else
 129     lenhist[1000]++;
 130 #endif
 131
 132   buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
 133                                             "string"));
 134
 135   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
 136   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 137
 138   STRINGBUF_CHARS (buf)[len] = 0;
 139
 140   return buf;
 141 }
 142
 143 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
 144    characters.  */
 145 static SCM
 146 make_wide_stringbuf (size_t len)
 147 {
 148   SCM buf;
 149   size_t raw_len;
 150
 151 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 152   if (len < 1000)
 153     lenhist[len]++;
 154   else
 155     lenhist[1000]++;
 156 #endif
 157
 158   raw_len = (len + 1) * sizeof (scm_t_wchar);
 159   buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
 160                                             "string"));
 161
 162   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
 163   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 164
 165   STRINGBUF_WIDE_CHARS (buf)[len] = 0;
 166
 167   return buf;
 168 }
 169
 170 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
 171    characters from BUF.  */
 172 static SCM
 173 wide_stringbuf (SCM buf)
 174 {
 175   SCM new_buf;
 176
 177   if (STRINGBUF_WIDE (buf))
 178     new_buf = buf;
 179   else
 180     {
 181       size_t i, len;
 182       scm_t_wchar *mem;
 183
 184       len = STRINGBUF_LENGTH (buf);
 185
 186       new_buf = make_wide_stringbuf (len);
 187
 188       mem = STRINGBUF_WIDE_CHARS (new_buf);
 189       for (i = 0; i < len; i++)
 190         mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
 191       mem[len] = 0;
 192     }
 193
 194   return new_buf;
 195 }
 196
 197 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
 198    characters from BUF, if possible.  */
 199 static SCM
 200 narrow_stringbuf (SCM buf)
 201 {
 202   SCM new_buf;
 203
 204   if (!STRINGBUF_WIDE (buf))
 205     new_buf = buf;
 206   else
 207     {
 208       size_t i, len;
 209       scm_t_wchar *wmem;
 210       unsigned char *mem;
 211
 212       len = STRINGBUF_LENGTH (buf);
 213       wmem = STRINGBUF_WIDE_CHARS (buf);
 214
 215       for (i = 0; i < len; i++)
 216         if (wmem[i] > 0xFF)
 217           /* BUF cannot be narrowed.  */
 218           return buf;
 219
 220       new_buf = make_stringbuf (len);
 221
 222       mem = STRINGBUF_CHARS (new_buf);
 223       for (i = 0; i < len; i++)
 224         mem[i] = (unsigned char) wmem[i];
 225       mem[len] = 0;
 226     }
 227
 228   return new_buf;
 229 }
 230
 231 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
 232
 233 \f
 234 /* Copy-on-write strings.
 235  */
 236
 237 #define STRING_TAG            scm_tc7_string
 238
 239 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
 240 #define STRING_START(str)     ((size_t)SCM_CELL_WORD_2(str))
 241 #define STRING_LENGTH(str)    ((size_t)SCM_CELL_WORD_3(str))
 242
 243 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
 244 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
 245
 246 #define IS_STRING(str)        (SCM_HAS_TYP7 (str, STRING_TAG))
 247
 248 /* Read-only strings.
 249  */
 250
 251 #define RO_STRING_TAG         scm_tc7_ro_string
 252 #define IS_RO_STRING(str)     (SCM_CELL_TYPE(str)==RO_STRING_TAG)
 253
 254 /* Mutation-sharing substrings
 255  */
 256
 257 #define SH_STRING_TAG       (scm_tc7_string + 0x100)
 258
 259 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
 260 /* START and LENGTH as for STRINGs. */
 261
 262 #define IS_SH_STRING(str)   (SCM_CELL_TYPE(str)==SH_STRING_TAG)
 263
 264 void
 265 scm_i_print_stringbuf (SCM exp, SCM port, scm_print_state *pstate)
 266 {
 267   SCM str;
 268
 269   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 270   SET_STRINGBUF_SHARED (exp);
 271   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 272
 273   str =  scm_double_cell (RO_STRING_TAG, SCM_UNPACK(exp),
 274                           0, STRINGBUF_LENGTH (exp));
 275
 276   scm_puts ("#<stringbuf ", port);
 277   scm_iprin1 (str, port, pstate);
 278   scm_puts (">", port);
 279 }
 280
 281 SCM scm_nullstr;
 282
 283 static SCM null_stringbuf;
 284
 285 static void
 286 init_null_stringbuf (void)
 287 {
 288   null_stringbuf = make_stringbuf (0);
 289   SET_STRINGBUF_SHARED (null_stringbuf);
 290 }
 291
 292 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
 293    characters.  CHARSP, if not NULL, will be set to location of the
 294    char array.  If READ_ONLY_P, the returned string is read-only;
 295    otherwise it is writable.  */
 296 SCM
 297 scm_i_make_string (size_t len, char **charsp, int read_only_p)
 298 {
 299   SCM buf;
 300   SCM res;
 301
 302   if (len == 0)
 303     {
 304       static scm_i_pthread_once_t once = SCM_I_PTHREAD_ONCE_INIT;
 305       scm_i_pthread_once (&once, init_null_stringbuf);
 306       buf = null_stringbuf;
 307     }
 308   else
 309     buf = make_stringbuf (len);
 310
 311   if (charsp)
 312     *charsp = (char *) STRINGBUF_CHARS (buf);
 313   res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
 314                          SCM_UNPACK (buf),
 315                          (scm_t_bits) 0, (scm_t_bits) len);
 316   return res;
 317 }
 318
 319 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
 320    characters.  CHARSP, if not NULL, will be set to location of the
 321    character array.  If READ_ONLY_P, the returned string is read-only;
 322    otherwise it is writable.  */
 323 SCM
 324 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
 325 {
 326   SCM buf = make_wide_stringbuf (len);
 327   SCM res;
 328   if (charsp)
 329     *charsp = STRINGBUF_WIDE_CHARS (buf);
 330   res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
 331                          SCM_UNPACK (buf),
 332                          (scm_t_bits) 0, (scm_t_bits) len);
 333   return res;
 334 }
 335
 336 static void
 337 validate_substring_args (SCM str, size_t start, size_t end)
 338 {
 339   if (!IS_STRING (str))
 340     scm_wrong_type_arg_msg (NULL, 0, str, "string");
 341   if (start > STRING_LENGTH (str))
 342     scm_out_of_range (NULL, scm_from_size_t (start));
 343   if (end > STRING_LENGTH (str) || end < start)
 344     scm_out_of_range (NULL, scm_from_size_t (end));
 345 }
 346
 347 static inline void
 348 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
 349 {
 350   *start = STRING_START (*str);
 351   if (IS_SH_STRING (*str))
 352     {
 353       *str = SH_STRING_STRING (*str);
 354       *start += STRING_START (*str);
 355     }
 356   *buf = STRING_STRINGBUF (*str);
 357 }
 358
 359 SCM
 360 scm_i_substring (SCM str, size_t start, size_t end)
 361 {
 362   if (start == end)
 363     return scm_i_make_string (0, NULL, 0);
 364   else
 365     {
 366       SCM buf;
 367       size_t str_start;
 368       get_str_buf_start (&str, &buf, &str_start);
 369       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 370       SET_STRINGBUF_SHARED (buf);
 371       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 372       return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
 373                               (scm_t_bits)str_start + start,
 374                               (scm_t_bits) end - start);
 375     }
 376 }
 377
 378 SCM
 379 scm_i_substring_read_only (SCM str, size_t start, size_t end)
 380 {
 381   if (start == end)
 382     return scm_i_make_string (0, NULL, 1);
 383   else
 384     {
 385       SCM buf;
 386       size_t str_start;
 387       get_str_buf_start (&str, &buf, &str_start);
 388       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 389       SET_STRINGBUF_SHARED (buf);
 390       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 391       return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
 392                               (scm_t_bits)str_start + start,
 393                               (scm_t_bits) end - start);
 394     }
 395 }
 396
 397 SCM
 398 scm_i_substring_copy (SCM str, size_t start, size_t end)
 399 {
 400   if (start == end)
 401     return scm_i_make_string (0, NULL, 0);
 402   else
 403     {
 404       size_t len = end - start;
 405       SCM buf, my_buf, substr;
 406       size_t str_start;
 407       int wide = 0;
 408       get_str_buf_start (&str, &buf, &str_start);
 409       if (scm_i_is_narrow_string (str))
 410         {
 411           my_buf = make_stringbuf (len);
 412           memcpy (STRINGBUF_CHARS (my_buf),
 413                   STRINGBUF_CHARS (buf) + str_start + start, len);
 414         }
 415       else
 416         {
 417           my_buf = make_wide_stringbuf (len);
 418           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
 419                    (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
 420                                      + start), len);
 421           wide = 1;
 422         }
 423       scm_remember_upto_here_1 (buf);
 424       substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
 425                                 (scm_t_bits) 0, (scm_t_bits) len);
 426       if (wide)
 427         scm_i_try_narrow_string (substr);
 428       return substr;
 429     }
 430 }
 431
 432 SCM
 433 scm_i_substring_shared (SCM str, size_t start, size_t end)
 434 {
 435   if (start == 0 && end == STRING_LENGTH (str))
 436     return str;
 437   else if (start == end)
 438     return scm_i_make_string (0, NULL, 0);
 439   else
 440     {
 441       size_t len = end - start;
 442       if (IS_SH_STRING (str))
 443         {
 444           start += STRING_START (str);
 445           str = SH_STRING_STRING (str);
 446         }
 447       return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
 448                               (scm_t_bits)start, (scm_t_bits) len);
 449     }
 450 }
 451
 452 SCM
 453 scm_c_substring (SCM str, size_t start, size_t end)
 454 {
 455   validate_substring_args (str, start, end);
 456   return scm_i_substring (str, start, end);
 457 }
 458
 459 SCM
 460 scm_c_substring_read_only (SCM str, size_t start, size_t end)
 461 {
 462   validate_substring_args (str, start, end);
 463   return scm_i_substring_read_only (str, start, end);
 464 }
 465
 466 SCM
 467 scm_c_substring_copy (SCM str, size_t start, size_t end)
 468 {
 469   validate_substring_args (str, start, end);
 470   return scm_i_substring_copy (str, start, end);
 471 }
 472
 473 SCM
 474 scm_c_substring_shared (SCM str, size_t start, size_t end)
 475 {
 476   validate_substring_args (str, start, end);
 477   return scm_i_substring_shared (str, start, end);
 478 }
 479
 480 \f
 481 /* Internal accessors
 482  */
 483
 484 /* Returns the number of characters in STR.  This may be different
 485    than the memory size of the string storage.  */
 486 size_t
 487 scm_i_string_length (SCM str)
 488 {
 489   return STRING_LENGTH (str);
 490 }
 491
 492 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
 493    encoding.  False if it is 'wide', having a 32-bit UCS-4
 494    encoding.  */
 495 int
 496 scm_i_is_narrow_string (SCM str)
 497 {
 498   if (IS_SH_STRING (str))
 499     str = SH_STRING_STRING (str);
 500
 501   return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
 502 }
 503
 504 /* Try to coerce a string to be narrow.  It if is narrow already, do
 505    nothing.  If it is wide, shrink it to narrow if none of its
 506    characters are above 0xFF.  Return true if the string is narrow or
 507    was made to be narrow.  */
 508 int
 509 scm_i_try_narrow_string (SCM str)
 510 {
 511   if (IS_SH_STRING (str))
 512     str = SH_STRING_STRING (str);
 513
 514   SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
 515
 516   return scm_i_is_narrow_string (str);
 517 }
 518
 519 /* Return a pointer to the raw data of the string, which can be either Latin-1
 520    or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'.  */
 521 const void *
 522 scm_i_string_data (SCM str)
 523 {
 524   SCM buf;
 525   size_t start;
 526   const char *data;
 527
 528   get_str_buf_start (&str, &buf, &start);
 529
 530   data = STRINGBUF_CONTENTS (buf);
 531   data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
 532
 533   return data;
 534 }
 535
 536 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
 537    STR.  */
 538 const char *
 539 scm_i_string_chars (SCM str)
 540 {
 541   SCM buf;
 542   size_t start;
 543   get_str_buf_start (&str, &buf, &start);
 544   if (scm_i_is_narrow_string (str))
 545     return (const char *) STRINGBUF_CHARS (buf) + start;
 546   else
 547     scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
 548                     scm_list_1 (str));
 549   return NULL;
 550 }
 551
 552 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
 553    STR.  */
 554 const scm_t_wchar *
 555 scm_i_string_wide_chars (SCM str)
 556 {
 557   SCM buf;
 558   size_t start;
 559
 560   get_str_buf_start (&str, &buf, &start);
 561   if (!scm_i_is_narrow_string (str))
 562     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
 563   else
 564     scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
 565                     scm_list_1 (str));
 566 }
 567
 568 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
 569    a new string buffer, so that it can be modified without modifying
 570    other strings.  Also, lock the string mutex.  Later, one must call
 571    scm_i_string_stop_writing to unlock the mutex.  */
 572 SCM
 573 scm_i_string_start_writing (SCM orig_str)
 574 {
 575   SCM buf, str = orig_str;
 576   size_t start;
 577
 578   get_str_buf_start (&str, &buf, &start);
 579   if (IS_RO_STRING (str))
 580     scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
 581
 582   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 583   if (STRINGBUF_SHARED (buf))
 584     {
 585       /* Clone the stringbuf.  */
 586       size_t len = STRING_LENGTH (str);
 587       SCM new_buf;
 588
 589       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 590
 591       if (scm_i_is_narrow_string (str))
 592         {
 593           new_buf = make_stringbuf (len);
 594           memcpy (STRINGBUF_CHARS (new_buf),
 595                   STRINGBUF_CHARS (buf) + STRING_START (str), len);
 596
 597         }
 598       else
 599         {
 600           new_buf = make_wide_stringbuf (len);
 601           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 602                    (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
 603                                      + STRING_START (str)), len);
 604         }
 605
 606       SET_STRING_STRINGBUF (str, new_buf);
 607       start -= STRING_START (str);
 608
 609       /* FIXME: The following operations are not atomic, so other threads
 610          looking at STR may see an inconsistent state.  Nevertheless it can't
 611          hurt much since (i) accessing STR while it is being mutated can't
 612          yield a crash, and (ii) concurrent accesses to STR should be
 613          protected by a mutex at the application level.  The latter may not
 614          apply when STR != ORIG_STR, though.  */
 615       SET_STRING_START (str, 0);
 616       SET_STRING_STRINGBUF (str, new_buf);
 617
 618       buf = new_buf;
 619
 620       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 621     }
 622   return orig_str;
 623 }
 624
 625 /* Return a pointer to the 8-bit Latin-1 chars of a string.  */
 626 char *
 627 scm_i_string_writable_chars (SCM str)
 628 {
 629   SCM buf;
 630   size_t start;
 631
 632   get_str_buf_start (&str, &buf, &start);
 633   if (scm_i_is_narrow_string (str))
 634     return (char *) STRINGBUF_CHARS (buf) + start;
 635   else
 636     scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
 637                     scm_list_1 (str));
 638   return NULL;
 639 }
 640
 641 /* Return a pointer to the UCS-4 codepoints of a string.  */
 642 static scm_t_wchar *
 643 scm_i_string_writable_wide_chars (SCM str)
 644 {
 645   SCM buf;
 646   size_t start;
 647
 648   get_str_buf_start (&str, &buf, &start);
 649   if (!scm_i_is_narrow_string (str))
 650     return STRINGBUF_WIDE_CHARS (buf) + start;
 651   else
 652     scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
 653                     scm_list_1 (str));
 654 }
 655
 656 /* Unlock the string mutex that was locked when
 657    scm_i_string_start_writing was called.  */
 658 void
 659 scm_i_string_stop_writing (void)
 660 {
 661   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 662 }
 663
 664 /* Return the Xth character of STR as a UCS-4 codepoint.  */
 665 scm_t_wchar
 666 scm_i_string_ref (SCM str, size_t x)
 667 {
 668   if (scm_i_is_narrow_string (str))
 669     return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
 670   else
 671     return scm_i_string_wide_chars (str)[x];
 672 }
 673
 674 /* Returns index+1 of the first char in STR that matches C, or
 675    0 if the char is not found.  */
 676 int
 677 scm_i_string_contains_char (SCM str, char ch)
 678 {
 679   size_t i;
 680   size_t len = scm_i_string_length (str);
 681
 682   i = 0;
 683   if (scm_i_is_narrow_string (str))
 684     {
 685       while (i < len)
 686         {
 687           if (scm_i_string_chars (str)[i] == ch)
 688             return i+1;
 689           i++;
 690         }
 691     }
 692   else
 693     {
 694       while (i < len)
 695         {
 696           if (scm_i_string_wide_chars (str)[i]
 697               == (unsigned char) ch)
 698             return i+1;
 699           i++;
 700         }
 701     }
 702   return 0;
 703 }
 704
 705 int
 706 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
 707 {
 708   if (scm_i_is_narrow_string (sstr))
 709     {
 710       const char *a = scm_i_string_chars (sstr) + start_x;
 711       const char *b = cstr;
 712       return strncmp (a, b, strlen(b));
 713     }
 714   else
 715     {
 716       size_t i;
 717       const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
 718       const char *b = cstr;
 719       for (i = 0; i < strlen (b); i++)
 720         {
 721           if (a[i] != (unsigned char) b[i])
 722             return 1;
 723         }
 724     }
 725   return 0;
 726 }
 727
 728 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
 729 void
 730 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
 731 {
 732   if (IS_SH_STRING (str))
 733     {
 734       p += STRING_START (str);
 735       str = SH_STRING_STRING (str);
 736     }
 737
 738   if (chr > 0xFF && scm_i_is_narrow_string (str))
 739     SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
 740
 741   if (scm_i_is_narrow_string (str))
 742     {
 743       char *dst = scm_i_string_writable_chars (str);
 744       dst[p] = chr;
 745     }
 746   else
 747     {
 748       scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
 749       dst[p] = chr;
 750     }
 751 }
 752
 753 \f
 754 /* Symbols.
 755
 756    Basic symbol creation and accessing is done here, the rest is in
 757    symbols.[hc].  This has been done to keep stringbufs and the
 758    internals of strings and string-like objects confined to this file.
 759 */
 760
 761 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
 762
 763 SCM
 764 scm_i_make_symbol (SCM name, scm_t_bits flags,
 765                    unsigned long hash, SCM props)
 766 {
 767   SCM buf;
 768   size_t start = STRING_START (name);
 769   size_t length = STRING_LENGTH (name);
 770
 771   if (IS_SH_STRING (name))
 772     {
 773       name = SH_STRING_STRING (name);
 774       start += STRING_START (name);
 775     }
 776   buf = STRING_STRINGBUF (name);
 777
 778   if (start == 0 && length == STRINGBUF_LENGTH (buf))
 779     {
 780       /* reuse buf. */
 781       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 782       SET_STRINGBUF_SHARED (buf);
 783       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 784     }
 785   else
 786     {
 787       /* make new buf. */
 788       if (scm_i_is_narrow_string (name))
 789         {
 790           SCM new_buf = make_stringbuf (length);
 791           memcpy (STRINGBUF_CHARS (new_buf),
 792                   STRINGBUF_CHARS (buf) + start, length);
 793           buf = new_buf;
 794         }
 795       else
 796         {
 797           SCM new_buf = make_wide_stringbuf (length);
 798           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 799                    (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
 800                    length);
 801           buf = new_buf;
 802         }
 803     }
 804   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 805                           (scm_t_bits) hash, SCM_UNPACK (props));
 806 }
 807
 808 SCM
 809 scm_i_c_make_symbol (const char *name, size_t len,
 810                      scm_t_bits flags, unsigned long hash, SCM props)
 811 {
 812   SCM buf = make_stringbuf (len);
 813   memcpy (STRINGBUF_CHARS (buf), name, len);
 814
 815   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 816                           (scm_t_bits) hash, SCM_UNPACK (props));
 817 }
 818
 819 /* Returns the number of characters in SYM.  This may be different
 820    from the memory size of SYM.  */
 821 size_t
 822 scm_i_symbol_length (SCM sym)
 823 {
 824   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 825 }
 826
 827 size_t
 828 scm_c_symbol_length (SCM sym)
 829 #define FUNC_NAME "scm_c_symbol_length"
 830 {
 831   SCM_VALIDATE_SYMBOL (1, sym);
 832
 833   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 834 }
 835 #undef FUNC_NAME
 836
 837 /* True if the name of SYM is stored as a Latin-1 encoded string.
 838    False if it is stored as a 32-bit UCS-4-encoded string.  */
 839 int
 840 scm_i_is_narrow_symbol (SCM sym)
 841 {
 842   SCM buf;
 843
 844   buf = SYMBOL_STRINGBUF (sym);
 845   return !STRINGBUF_WIDE (buf);
 846 }
 847
 848 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
 849    contains the name of SYM.  */
 850 const char *
 851 scm_i_symbol_chars (SCM sym)
 852 {
 853   SCM buf;
 854
 855   buf = SYMBOL_STRINGBUF (sym);
 856   if (!STRINGBUF_WIDE (buf))
 857     return (const char *) STRINGBUF_CHARS (buf);
 858   else
 859     scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
 860                     scm_list_1 (sym));
 861 }
 862
 863 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
 864    symbol's name.  */
 865 const scm_t_wchar *
 866 scm_i_symbol_wide_chars (SCM sym)
 867 {
 868   SCM buf;
 869
 870   buf = SYMBOL_STRINGBUF (sym);
 871   if (STRINGBUF_WIDE (buf))
 872     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
 873   else
 874     scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
 875                     scm_list_1 (sym));
 876 }
 877
 878 SCM
 879 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
 880 {
 881   SCM buf = SYMBOL_STRINGBUF (sym);
 882   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 883   SET_STRINGBUF_SHARED (buf);
 884   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 885   return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
 886                           (scm_t_bits)start, (scm_t_bits) end - start);
 887 }
 888
 889 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint.  */
 890 scm_t_wchar
 891 scm_i_symbol_ref (SCM sym, size_t x)
 892 {
 893   if (scm_i_is_narrow_symbol (sym))
 894     return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
 895   else
 896     return scm_i_symbol_wide_chars (sym)[x];
 897 }
 898
 899 /* Debugging
 900  */
 901
 902 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
 903             "Returns an association list containing debugging information\n"
 904             "for @var{str}. The association list has the following entries."
 905             "@table @code\n"
 906             "@item string\n"
 907             "The string itself.\n"
 908             "@item start\n"
 909             "The start index of the string into its stringbuf\n"
 910             "@item length\n"
 911             "The length of the string\n"
 912             "@item shared\n"
 913             "If this string is a substring, it returns its parent string.\n"
 914             "Otherwise, it returns @code{#f}\n"
 915             "@item read-only\n"
 916             "@code{#t} if the string is read-only\n"
 917             "@item stringbuf-chars\n"
 918             "A new string containing this string's stringbuf's characters\n"
 919             "@item stringbuf-length\n"
 920             "The number of characters in this stringbuf\n"
 921             "@item stringbuf-shared\n"
 922             "@code{#t} if this stringbuf is shared\n"
 923             "@item stringbuf-wide\n"
 924             "@code{#t} if this stringbuf's characters are stored in a\n"
 925             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
 926             "buffer\n"
 927             "@end table")
 928 #define FUNC_NAME s_scm_sys_string_dump
 929 {
 930   SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
 931   SCM buf;
 932   SCM_VALIDATE_STRING (1, str);
 933
 934   /* String info */
 935   e1 = scm_cons (scm_from_latin1_symbol ("string"),
 936                  str);
 937   e2 = scm_cons (scm_from_latin1_symbol ("start"),
 938                  scm_from_size_t (STRING_START (str)));
 939   e3 = scm_cons (scm_from_latin1_symbol ("length"),
 940                  scm_from_size_t (STRING_LENGTH (str)));
 941
 942   if (IS_SH_STRING (str))
 943     {
 944       e4 = scm_cons (scm_from_latin1_symbol ("shared"),
 945                      SH_STRING_STRING (str));
 946       buf = STRING_STRINGBUF (SH_STRING_STRING (str));
 947     }
 948   else
 949     {
 950       e4 = scm_cons (scm_from_latin1_symbol ("shared"),
 951                      SCM_BOOL_F);
 952       buf = STRING_STRINGBUF (str);
 953     }
 954
 955   if (IS_RO_STRING (str))
 956     e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
 957                    SCM_BOOL_T);
 958   else
 959     e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
 960                    SCM_BOOL_F);
 961
 962   /* Stringbuf info */
 963   if (!STRINGBUF_WIDE (buf))
 964     {
 965       size_t len = STRINGBUF_LENGTH (buf);
 966       char *cbuf;
 967       SCM sbc = scm_i_make_string (len, &cbuf, 0);
 968       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
 969       e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 970                      sbc);
 971     }
 972   else
 973     {
 974       size_t len = STRINGBUF_LENGTH (buf);
 975       scm_t_wchar *cbuf;
 976       SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
 977       u32_cpy ((scm_t_uint32 *) cbuf,
 978                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
 979       e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 980                      sbc);
 981     }
 982   e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
 983                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
 984   if (STRINGBUF_SHARED (buf))
 985     e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 986                    SCM_BOOL_T);
 987   else
 988     e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 989                    SCM_BOOL_F);
 990   if (STRINGBUF_WIDE (buf))
 991     e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 992                    SCM_BOOL_T);
 993   else
 994     e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 995                    SCM_BOOL_F);
 996
 997   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
 998 }
 999 #undef FUNC_NAME
1000
1001 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
1002             "Returns an association list containing debugging information\n"
1003             "for @var{sym}. The association list has the following entries."
1004             "@table @code\n"
1005             "@item symbol\n"
1006             "The symbol itself\n"
1007             "@item hash\n"
1008             "Its hash value\n"
1009             "@item interned\n"
1010             "@code{#t} if it is an interned symbol\n"
1011             "@item stringbuf-chars\n"
1012             "A new string containing this symbols's stringbuf's characters\n"
1013             "@item stringbuf-length\n"
1014             "The number of characters in this stringbuf\n"
1015             "@item stringbuf-shared\n"
1016             "@code{#t} if this stringbuf is shared\n"
1017             "@item stringbuf-wide\n"
1018             "@code{#t} if this stringbuf's characters are stored in a\n"
1019             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
1020             "buffer\n"
1021             "@end table")
1022 #define FUNC_NAME s_scm_sys_symbol_dump
1023 {
1024   SCM e1, e2, e3, e4, e5, e6, e7;
1025   SCM buf;
1026   SCM_VALIDATE_SYMBOL (1, sym);
1027   e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
1028                  sym);
1029   e2 = scm_cons (scm_from_latin1_symbol ("hash"),
1030                  scm_from_ulong (scm_i_symbol_hash (sym)));
1031   e3 = scm_cons (scm_from_latin1_symbol ("interned"),
1032                  scm_symbol_interned_p (sym));
1033   buf = SYMBOL_STRINGBUF (sym);
1034
1035   /* Stringbuf info */
1036   if (!STRINGBUF_WIDE (buf))
1037     {
1038       size_t len = STRINGBUF_LENGTH (buf);
1039       char *cbuf;
1040       SCM sbc = scm_i_make_string (len, &cbuf, 0);
1041       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
1042       e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1043                      sbc);
1044     }
1045   else
1046     {
1047       size_t len = STRINGBUF_LENGTH (buf);
1048       scm_t_wchar *cbuf;
1049       SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
1050       u32_cpy ((scm_t_uint32 *) cbuf,
1051                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
1052       e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1053                      sbc);
1054     }
1055   e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
1056                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
1057   if (STRINGBUF_SHARED (buf))
1058     e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1059                    SCM_BOOL_T);
1060   else
1061     e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1062                    SCM_BOOL_F);
1063   if (STRINGBUF_WIDE (buf))
1064     e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1065                     SCM_BOOL_T);
1066   else
1067     e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1068                     SCM_BOOL_F);
1069   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
1070
1071 }
1072 #undef FUNC_NAME
1073
1074 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1075
1076 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
1077 #define FUNC_NAME s_scm_sys_stringbuf_hist
1078 {
1079   int i;
1080   for (i = 0; i < 1000; i++)
1081     if (lenhist[i])
1082       fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1083   fprintf (stderr, ">999: %u\n", lenhist[1000]);
1084   return SCM_UNSPECIFIED;
1085 }
1086 #undef FUNC_NAME
1087
1088 #endif
1089
1090 \f
1091
1092 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1093             (SCM obj),
1094             "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1095 #define FUNC_NAME s_scm_string_p
1096 {
1097   return scm_from_bool (IS_STRING (obj));
1098 }
1099 #undef FUNC_NAME
1100
1101
1102 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1103
1104 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1105             (SCM chrs),
1106             "@deffnx {Scheme Procedure} list->string chrs\n"
1107             "Return a newly allocated string composed of the arguments,\n"
1108             "@var{chrs}.")
1109 #define FUNC_NAME s_scm_string
1110 {
1111   SCM result = SCM_BOOL_F;
1112   SCM rest;
1113   size_t len;
1114   size_t p = 0;
1115   long i;
1116   int wide = 0;
1117
1118   /* Verify that this is a list of chars.  */
1119   i = scm_ilength (chrs);
1120   SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1121
1122   len = (size_t) i;
1123   rest = chrs;
1124
1125   while (len > 0 && scm_is_pair (rest))
1126     {
1127       SCM elt = SCM_CAR (rest);
1128       SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1129       if (SCM_CHAR (elt) > 0xFF)
1130         wide = 1;
1131       rest = SCM_CDR (rest);
1132       len--;
1133       scm_remember_upto_here_1 (elt);
1134     }
1135
1136   /* Construct a string containing this list of chars.  */
1137   len = (size_t) i;
1138   rest = chrs;
1139
1140   if (wide == 0)
1141     {
1142       char *buf;
1143
1144       result = scm_i_make_string (len, NULL, 0);
1145       result = scm_i_string_start_writing (result);
1146       buf = scm_i_string_writable_chars (result);
1147       while (len > 0 && scm_is_pair (rest))
1148         {
1149           SCM elt = SCM_CAR (rest);
1150           buf[p] = (unsigned char) SCM_CHAR (elt);
1151           p++;
1152           rest = SCM_CDR (rest);
1153           len--;
1154           scm_remember_upto_here_1 (elt);
1155         }
1156     }
1157   else
1158     {
1159       scm_t_wchar *buf;
1160
1161       result = scm_i_make_wide_string (len, NULL, 0);
1162       result = scm_i_string_start_writing (result);
1163       buf = scm_i_string_writable_wide_chars (result);
1164       while (len > 0 && scm_is_pair (rest))
1165         {
1166           SCM elt = SCM_CAR (rest);
1167           buf[p] = SCM_CHAR (elt);
1168           p++;
1169           rest = SCM_CDR (rest);
1170           len--;
1171           scm_remember_upto_here_1 (elt);
1172         }
1173     }
1174   scm_i_string_stop_writing ();
1175
1176   if (len > 0)
1177     scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1178   if (!scm_is_null (rest))
1179     scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1180
1181   return result;
1182 }
1183 #undef FUNC_NAME
1184
1185 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1186             (SCM k, SCM chr),
1187             "Return a newly allocated string of\n"
1188             "length @var{k}.  If @var{chr} is given, then all elements of\n"
1189             "the string are initialized to @var{chr}, otherwise the contents\n"
1190             "of the string are all set to @code{#\nul}.")
1191 #define FUNC_NAME s_scm_make_string
1192 {
1193   return scm_c_make_string (scm_to_size_t (k), chr);
1194 }
1195 #undef FUNC_NAME
1196
1197 SCM
1198 scm_c_make_string (size_t len, SCM chr)
1199 #define FUNC_NAME NULL
1200 {
1201   size_t p;
1202   char *contents = NULL;
1203   SCM res = scm_i_make_string (len, &contents, 0);
1204
1205   /* If no char is given, initialize string contents to NULL.  */
1206   if (SCM_UNBNDP (chr))
1207     memset (contents, 0, len);
1208   else
1209     {
1210       SCM_VALIDATE_CHAR (0, chr);
1211       res = scm_i_string_start_writing (res);
1212       for (p = 0; p < len; p++)
1213         scm_i_string_set_x (res, p, SCM_CHAR (chr));
1214       scm_i_string_stop_writing ();
1215     }
1216
1217   return res;
1218 }
1219 #undef FUNC_NAME
1220
1221 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1222             (SCM string),
1223             "Return the number of characters in @var{string}.")
1224 #define FUNC_NAME s_scm_string_length
1225 {
1226   SCM_VALIDATE_STRING (1, string);
1227   return scm_from_size_t (STRING_LENGTH (string));
1228 }
1229 #undef FUNC_NAME
1230
1231 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1232             (SCM string),
1233             "Return the bytes used to represent a character in @var{string}."
1234             "This will return 1 or 4.")
1235 #define FUNC_NAME s_scm_string_bytes_per_char
1236 {
1237   SCM_VALIDATE_STRING (1, string);
1238   if (!scm_i_is_narrow_string (string))
1239     return scm_from_int (4);
1240
1241   return scm_from_int (1);
1242 }
1243 #undef FUNC_NAME
1244
1245 size_t
1246 scm_c_string_length (SCM string)
1247 {
1248   if (!IS_STRING (string))
1249     scm_wrong_type_arg_msg (NULL, 0, string, "string");
1250   return STRING_LENGTH (string);
1251 }
1252
1253 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1254             (SCM str, SCM k),
1255             "Return character @var{k} of @var{str} using zero-origin\n"
1256             "indexing. @var{k} must be a valid index of @var{str}.")
1257 #define FUNC_NAME s_scm_string_ref
1258 {
1259   size_t len;
1260   unsigned long idx;
1261
1262   SCM_VALIDATE_STRING (1, str);
1263
1264   len = scm_i_string_length (str);
1265   if (SCM_LIKELY (len > 0))
1266     idx = scm_to_unsigned_integer (k, 0, len - 1);
1267   else
1268     scm_out_of_range (NULL, k);
1269
1270   if (scm_i_is_narrow_string (str))
1271     return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1272   else
1273     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1274 }
1275 #undef FUNC_NAME
1276
1277 SCM
1278 scm_c_string_ref (SCM str, size_t p)
1279 {
1280   if (p >= scm_i_string_length (str))
1281     scm_out_of_range (NULL, scm_from_size_t (p));
1282   if (scm_i_is_narrow_string (str))
1283     return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1284   else
1285     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1286
1287 }
1288
1289 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1290             (SCM str, SCM k, SCM chr),
1291             "Store @var{chr} in element @var{k} of @var{str} and return\n"
1292             "an unspecified value. @var{k} must be a valid index of\n"
1293             "@var{str}.")
1294 #define FUNC_NAME s_scm_string_set_x
1295 {
1296   size_t len;
1297   unsigned long idx;
1298
1299   SCM_VALIDATE_STRING (1, str);
1300
1301   len = scm_i_string_length (str);
1302   if (SCM_LIKELY (len > 0))
1303     idx = scm_to_unsigned_integer (k, 0, len - 1);
1304   else
1305     scm_out_of_range (NULL, k);
1306
1307   SCM_VALIDATE_CHAR (3, chr);
1308   str = scm_i_string_start_writing (str);
1309   scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1310   scm_i_string_stop_writing ();
1311
1312   return SCM_UNSPECIFIED;
1313 }
1314 #undef FUNC_NAME
1315
1316 void
1317 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1318 {
1319   if (p >= scm_i_string_length (str))
1320     scm_out_of_range (NULL, scm_from_size_t (p));
1321   str = scm_i_string_start_writing (str);
1322   scm_i_string_set_x (str, p, SCM_CHAR (chr));
1323   scm_i_string_stop_writing ();
1324 }
1325
1326 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1327             (SCM str, SCM start, SCM end),
1328             "Return a newly allocated string formed from the characters\n"
1329             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1330             "ending with index @var{end} (exclusive).\n"
1331             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1332             "exact integers satisfying:\n\n"
1333             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1334 #define FUNC_NAME s_scm_substring
1335 {
1336   size_t len, from, to;
1337
1338   SCM_VALIDATE_STRING (1, str);
1339   len = scm_i_string_length (str);
1340   from = scm_to_unsigned_integer (start, 0, len);
1341   if (SCM_UNBNDP (end))
1342     to = len;
1343   else
1344     to = scm_to_unsigned_integer (end, from, len);
1345   return scm_i_substring (str, from, to);
1346 }
1347 #undef FUNC_NAME
1348
1349 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1350             (SCM str, SCM start, SCM end),
1351             "Return a newly allocated string formed from the characters\n"
1352             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1353             "ending with index @var{end} (exclusive).\n"
1354             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1355             "exact integers satisfying:\n"
1356             "\n"
1357             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1358             "\n"
1359             "The returned string is read-only.\n")
1360 #define FUNC_NAME s_scm_substring_read_only
1361 {
1362   size_t len, from, to;
1363
1364   SCM_VALIDATE_STRING (1, str);
1365   len = scm_i_string_length (str);
1366   from = scm_to_unsigned_integer (start, 0, len);
1367   if (SCM_UNBNDP (end))
1368     to = len;
1369   else
1370     to = scm_to_unsigned_integer (end, from, len);
1371   return scm_i_substring_read_only (str, from, to);
1372 }
1373 #undef FUNC_NAME
1374
1375 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1376             (SCM str, SCM start, SCM end),
1377             "Return a newly allocated string formed from the characters\n"
1378             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1379             "ending with index @var{end} (exclusive).\n"
1380             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1381             "exact integers satisfying:\n\n"
1382             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1383 #define FUNC_NAME s_scm_substring_copy
1384 {
1385   /* For the Scheme version, START is mandatory, but for the C
1386      version, it is optional.  See scm_string_copy in srfi-13.c for a
1387      rationale.
1388   */
1389
1390   size_t from, to;
1391
1392   SCM_VALIDATE_STRING (1, str);
1393   scm_i_get_substring_spec (scm_i_string_length (str),
1394                             start, &from, end, &to);
1395   return scm_i_substring_copy (str, from, to);
1396 }
1397 #undef FUNC_NAME
1398
1399 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1400             (SCM str, SCM start, SCM end),
1401             "Return string that indirectly refers to the characters\n"
1402             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1403             "ending with index @var{end} (exclusive).\n"
1404             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1405             "exact integers satisfying:\n\n"
1406             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1407 #define FUNC_NAME s_scm_substring_shared
1408 {
1409   size_t len, from, to;
1410
1411   SCM_VALIDATE_STRING (1, str);
1412   len = scm_i_string_length (str);
1413   from = scm_to_unsigned_integer (start, 0, len);
1414   if (SCM_UNBNDP (end))
1415     to = len;
1416   else
1417     to = scm_to_unsigned_integer (end, from, len);
1418   return scm_i_substring_shared (str, from, to);
1419 }
1420 #undef FUNC_NAME
1421
1422 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1423             (SCM args),
1424             "Return a newly allocated string whose characters form the\n"
1425             "concatenation of the given strings, @var{args}.")
1426 #define FUNC_NAME s_scm_string_append
1427 {
1428   SCM res;
1429   size_t total = 0;
1430   size_t len;
1431   int wide = 0;
1432   SCM l, s;
1433   size_t i;
1434   union
1435   {
1436     char *narrow;
1437     scm_t_wchar *wide;
1438   } data;
1439
1440   SCM_VALIDATE_REST_ARGUMENT (args);
1441   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1442     {
1443       s = SCM_CAR (l);
1444       SCM_VALIDATE_STRING (SCM_ARGn, s);
1445       len = scm_i_string_length (s);
1446       if (((size_t) -1) - total < len)
1447         scm_num_overflow (s_scm_string_append);
1448       total += len;
1449       if (!scm_i_is_narrow_string (s))
1450         wide = 1;
1451     }
1452   data.narrow = NULL;
1453   if (!wide)
1454     res = scm_i_make_string (total, &data.narrow, 0);
1455   else
1456     res = scm_i_make_wide_string (total, &data.wide, 0);
1457
1458   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1459     {
1460       size_t len;
1461       s = SCM_CAR (l);
1462       SCM_VALIDATE_STRING (SCM_ARGn, s);
1463       len = scm_i_string_length (s);
1464       if (len > total)
1465         SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
1466       if (!wide)
1467         {
1468           memcpy (data.narrow, scm_i_string_chars (s), len);
1469           data.narrow += len;
1470         }
1471       else
1472         {
1473           if (scm_i_is_narrow_string (s))
1474             {
1475               const char *src = scm_i_string_chars (s);
1476               for (i = 0; i < len; i++)
1477                 data.wide[i] = (unsigned char) src[i];
1478             }
1479           else
1480             u32_cpy ((scm_t_uint32 *) data.wide,
1481                      (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1482           data.wide += len;
1483         }
1484       total -= len;
1485       scm_remember_upto_here_1 (s);
1486     }
1487   if (total != 0)
1488     SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
1489   return res;
1490 }
1491 #undef FUNC_NAME
1492
1493
1494 \f
1495 /* Charset conversion error handling.  */
1496
1497 SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1498 SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1499
1500 /* Raise an exception informing that character CHR could not be written
1501    to PORT in its current encoding.  */
1502 void
1503 scm_encoding_error (const char *subr, int err, const char *message,
1504                     SCM port, SCM chr)
1505 {
1506   scm_throw (scm_encoding_error_key,
1507              scm_list_n (scm_from_latin1_string (subr),
1508                          scm_from_latin1_string (message),
1509                          scm_from_int (err),
1510                          port, chr,
1511                          SCM_UNDEFINED));
1512 }
1513
1514 /* Raise an exception informing of an encoding error on PORT.  This
1515    means that a character could not be written in PORT's encoding.  */
1516 void
1517 scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1518 {
1519   scm_throw (scm_decoding_error_key,
1520              scm_list_n (scm_from_latin1_string (subr),
1521                          scm_from_latin1_string (message),
1522                          scm_from_int (err),
1523                          port,
1524                          SCM_UNDEFINED));
1525 }
1526
1527 \f
1528 /* String conversion to/from C.  */
1529
1530 static void
1531 decoding_error (const char *func_name, int errno_save,
1532                 const char *str, size_t len)
1533 {
1534   /* Raise an error and pass the raw C string as a bytevector to the `throw'
1535      handler.  */
1536   SCM bv;
1537   signed char *buf;
1538
1539   buf = scm_gc_malloc_pointerless (len, "bytevector");
1540   memcpy (buf, str, len);
1541   bv = scm_c_take_gc_bytevector (buf, len, SCM_BOOL_F);
1542
1543   scm_decoding_error (func_name, errno_save,
1544                       "input locale conversion error", bv);
1545 }
1546
1547 SCM
1548 scm_from_stringn (const char *str, size_t len, const char *encoding,
1549                   scm_t_string_failed_conversion_handler handler)
1550 {
1551   size_t u32len, i;
1552   scm_t_wchar *u32;
1553   int wide = 0;
1554   SCM res;
1555
1556   /* The order of these checks is important. */
1557   if (!str && len != 0)
1558     scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1559   if (len == (size_t) -1)
1560     len = strlen (str);
1561
1562   if (c_strcasecmp (encoding, "ISO-8859-1") == 0 || len == 0)
1563     return scm_from_latin1_stringn (str, len);
1564   else if (c_strcasecmp (encoding, "UTF-8") == 0
1565            && handler == SCM_FAILED_CONVERSION_ERROR)
1566     return scm_from_utf8_stringn (str, len);
1567
1568   u32len = 0;
1569   u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1570                                                 (enum iconv_ilseq_handler)
1571                                                 handler,
1572                                                 str, len,
1573                                                 NULL,
1574                                                 NULL, &u32len);
1575
1576   if (SCM_UNLIKELY (u32 == NULL))
1577     decoding_error (__func__, errno, str, len);
1578
1579   i = 0;
1580   while (i < u32len)
1581     if (u32[i++] > 0xFF)
1582       {
1583         wide = 1;
1584         break;
1585       }
1586
1587   if (!wide)
1588     {
1589       char *dst;
1590       res = scm_i_make_string (u32len, &dst, 0);
1591       for (i = 0; i < u32len; i ++)
1592         dst[i] = (unsigned char) u32[i];
1593       dst[u32len] = '\0';
1594     }
1595   else
1596     {
1597       scm_t_wchar *wdst;
1598       res = scm_i_make_wide_string (u32len, &wdst, 0);
1599       u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1600       wdst[u32len] = 0;
1601     }
1602
1603   free (u32);
1604   return res;
1605 }
1606
1607 SCM
1608 scm_from_locale_string (const char *str)
1609 {
1610   return scm_from_locale_stringn (str, -1);
1611 }
1612
1613 SCM
1614 scm_from_locale_stringn (const char *str, size_t len)
1615 {
1616   return scm_from_stringn (str, len, locale_charset (),
1617                            scm_i_default_port_conversion_handler ());
1618 }
1619
1620 SCM
1621 scm_from_latin1_string (const char *str)
1622 {
1623   return scm_from_latin1_stringn (str, -1);
1624 }
1625
1626 SCM
1627 scm_from_latin1_stringn (const char *str, size_t len)
1628 {
1629   char *buf;
1630   SCM result;
1631
1632   if (len == (size_t) -1)
1633     len = strlen (str);
1634
1635   /* Make a narrow string and copy STR as is.  */
1636   result = scm_i_make_string (len, &buf, 0);
1637   memcpy (buf, str, len);
1638
1639   return result;
1640 }
1641
1642 SCM
1643 scm_from_utf8_string (const char *str)
1644 {
1645   return scm_from_utf8_stringn (str, -1);
1646 }
1647
1648 SCM
1649 scm_from_utf8_stringn (const char *str, size_t len)
1650 {
1651   size_t i, char_len;
1652   const scm_t_uint8 *ustr = (const scm_t_uint8 *) str;
1653   int ascii = 1, narrow = 1;
1654   SCM res;
1655
1656   if (len == (size_t) -1)
1657     len = strlen (str);
1658
1659   i = 0;
1660   char_len = 0;
1661
1662   while (i < len)
1663     {
1664       if (ustr[i] <= 127)
1665         {
1666           char_len++;
1667           i++;
1668         }
1669       else
1670         {
1671           ucs4_t c;
1672           int nbytes;
1673
1674           ascii = 0;
1675
1676           nbytes = u8_mbtouc (&c, ustr + i, len - i);
1677
1678           if (c == 0xfffd)
1679             /* Bad UTF-8.  */
1680             decoding_error (__func__, errno, str, len);
1681
1682           if (c > 255)
1683             narrow = 0;
1684
1685           char_len++;
1686           i += nbytes;
1687         }
1688     }
1689
1690   if (ascii)
1691     {
1692       char *dst;
1693       res = scm_i_make_string (char_len, &dst, 0);
1694       memcpy (dst, str, len);
1695     }
1696   else if (narrow)
1697     {
1698       char *dst;
1699       size_t j;
1700       ucs4_t c;
1701
1702       res = scm_i_make_string (char_len, &dst, 0);
1703
1704       for (i = 0, j = 0; i < len; j++)
1705         {
1706           i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1707           dst[j] = (signed char) c;
1708         }
1709     }
1710   else
1711     {
1712       scm_t_wchar *dst;
1713       size_t j;
1714       ucs4_t c;
1715
1716       res = scm_i_make_wide_string (char_len, &dst, 0);
1717
1718       for (i = 0, j = 0; i < len; j++)
1719         {
1720           i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1721           dst[j] = c;
1722         }
1723     }
1724
1725   return res;
1726 }
1727
1728 SCM
1729 scm_from_utf32_string (const scm_t_wchar *str)
1730 {
1731   return scm_from_utf32_stringn (str, -1);
1732 }
1733
1734 SCM
1735 scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1736 {
1737   SCM result;
1738   scm_t_wchar *buf;
1739
1740   if (len == (size_t) -1)
1741     len = u32_strlen ((uint32_t *) str);
1742
1743   result = scm_i_make_wide_string (len, &buf, 0);
1744   memcpy (buf, str, len * sizeof (scm_t_wchar));
1745   scm_i_try_narrow_string (result);
1746
1747   return result;
1748 }
1749
1750 SCM
1751 scm_from_port_string (const char *str, SCM port)
1752 {
1753   return scm_from_port_stringn (str, -1, port);
1754 }
1755
1756 SCM
1757 scm_from_port_stringn (const char *str, size_t len, SCM port)
1758 {
1759   scm_t_port *pt = SCM_PTAB_ENTRY (port);
1760   scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
1761
1762   if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1)
1763     return scm_from_latin1_stringn (str, len);
1764   else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8
1765            && (pt->ilseq_handler == SCM_FAILED_CONVERSION_ERROR
1766                || (u8_check ((uint8_t *) str, len) == NULL)))
1767     return scm_from_utf8_stringn (str, len);
1768   else
1769     return scm_from_stringn (str, len, pt->encoding, pt->ilseq_handler);
1770 }
1771
1772 /* Create a new scheme string from the C string STR.  The memory of
1773    STR may be used directly as storage for the new string.  */
1774 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1775    would be to register a finalizer to eventually free(3) STR, which isn't
1776    worth it.  Should we just deprecate the `scm_take_' functions?  */
1777 SCM
1778 scm_take_locale_stringn (char *str, size_t len)
1779 {
1780   SCM res;
1781
1782   res = scm_from_locale_stringn (str, len);
1783   free (str);
1784
1785   return res;
1786 }
1787
1788 SCM
1789 scm_take_locale_string (char *str)
1790 {
1791   return scm_take_locale_stringn (str, -1);
1792 }
1793
1794 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1795    *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1796    Set *LENP to the size of the resulting string.
1797
1798    FIXME: This is a hack we should get rid of.  See
1799    <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1800    for details.  */
1801 static void
1802 unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
1803 {
1804   char *before, *after;
1805   size_t i, j;
1806
1807   before = buf;
1808   after = buf;
1809   i = 0;
1810   j = 0;
1811   while (i < *lenp)
1812     {
1813       if ((i <= *lenp - 6)
1814           && before[i] == '\\'
1815           && before[i + 1] == 'u'
1816           && before[i + 2] == '0' && before[i + 3] == '0')
1817         {
1818           /* Convert \u00NN to \xNN */
1819           after[j] = '\\';
1820           after[j + 1] = 'x';
1821           after[j + 2] = tolower ((int) before[i + 4]);
1822           after[j + 3] = tolower ((int) before[i + 5]);
1823           i += 6;
1824           j += 4;
1825         }
1826       else if ((i <= *lenp - 10)
1827                && before[i] == '\\'
1828                && before[i + 1] == 'U'
1829                && before[i + 2] == '0' && before[i + 3] == '0')
1830         {
1831           /* Convert \U00NNNNNN to \UNNNNNN */
1832           after[j] = '\\';
1833           after[j + 1] = 'U';
1834           after[j + 2] = tolower ((int) before[i + 4]);
1835           after[j + 3] = tolower ((int) before[i + 5]);
1836           after[j + 4] = tolower ((int) before[i + 6]);
1837           after[j + 5] = tolower ((int) before[i + 7]);
1838           after[j + 6] = tolower ((int) before[i + 8]);
1839           after[j + 7] = tolower ((int) before[i + 9]);
1840           i += 10;
1841           j += 8;
1842         }
1843       else
1844         {
1845           after[j] = before[i];
1846           i++;
1847           j++;
1848         }
1849     }
1850   *lenp = j;
1851 }
1852
1853 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1854    *LENP-byte locale-encoded string, to `\xXXXX;'.  Set *LEN to the size
1855    of the resulting string.  BUF must be large enough to handle the
1856    worst case when `\uXXXX' escapes (6 characters) are replaced by
1857    `\xXXXX;' (7 characters).  */
1858 static void
1859 unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
1860 {
1861   char *before, *after;
1862   size_t i, j;
1863   /* The worst case is if the input string contains all 4-digit hex escapes.
1864      "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1865   size_t max_out_len = (*lenp * 7) / 6 + 1;
1866   size_t nzeros, ndigits;
1867
1868   before = buf;
1869   after = alloca (max_out_len);
1870   i = 0;
1871   j = 0;
1872   while (i < *lenp)
1873     {
1874       if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1875           || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1876         {
1877           if (before[i + 1] == 'u')
1878             ndigits = 4;
1879           else if (before[i + 1] == 'U')
1880             ndigits = 8;
1881           else
1882             abort ();
1883
1884           /* Add the R6RS hex escape initial sequence.  */
1885           after[j] = '\\';
1886           after[j + 1] = 'x';
1887
1888           /* Move string positions to the start of the hex numbers.  */
1889           i += 2;
1890           j += 2;
1891
1892           /* Find the number of initial zeros in this hex number.  */
1893           nzeros = 0;
1894           while (before[i + nzeros] == '0' && nzeros < ndigits)
1895             nzeros++;
1896
1897           /* Copy the number, skipping initial zeros, and then move the string
1898              positions.  */
1899           if (nzeros == ndigits)
1900             {
1901               after[j] = '0';
1902               i += ndigits;
1903               j += 1;
1904             }
1905           else
1906             {
1907               int pos;
1908               for (pos = 0; pos < ndigits - nzeros; pos++)
1909                 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1910               i += ndigits;
1911               j += (ndigits - nzeros);
1912             }
1913
1914           /* Add terminating semicolon.  */
1915           after[j] = ';';
1916           j++;
1917         }
1918       else
1919         {
1920           after[j] = before[i];
1921           i++;
1922           j++;
1923         }
1924     }
1925   *lenp = j;
1926   memcpy (before, after, j);
1927 }
1928
1929 char *
1930 scm_to_locale_string (SCM str)
1931 {
1932   return scm_to_locale_stringn (str, NULL);
1933 }
1934
1935 char *
1936 scm_to_locale_stringn (SCM str, size_t *lenp)
1937 {
1938   return scm_to_stringn (str, lenp,
1939                          locale_charset (),
1940                          scm_i_default_port_conversion_handler ());
1941 }
1942
1943 char *
1944 scm_to_latin1_string (SCM str)
1945 {
1946   return scm_to_latin1_stringn (str, NULL);
1947 }
1948
1949 char *
1950 scm_to_latin1_stringn (SCM str, size_t *lenp)
1951 #define FUNC_NAME "scm_to_latin1_stringn"
1952 {
1953   char *result;
1954
1955   SCM_VALIDATE_STRING (1, str);
1956
1957   if (scm_i_is_narrow_string (str))
1958     {
1959       size_t len = scm_i_string_length (str);
1960
1961       if (lenp)
1962         *lenp = len;
1963
1964       result = scm_strndup (scm_i_string_data (str), len);
1965     }
1966   else
1967     result = scm_to_stringn (str, lenp, NULL,
1968                              SCM_FAILED_CONVERSION_ERROR);
1969
1970   return result;
1971 }
1972 #undef FUNC_NAME
1973
1974 char *
1975 scm_to_utf8_string (SCM str)
1976 {
1977   return scm_to_utf8_stringn (str, NULL);
1978 }
1979
1980 static size_t
1981 latin1_u8_strlen (const scm_t_uint8 *str, size_t len)
1982 {
1983   size_t ret, i;
1984   for (i = 0, ret = 0; i < len; i++)
1985     ret += (str[i] < 128) ? 1 : 2;
1986   return ret;
1987 }
1988
1989 static scm_t_uint8*
1990 latin1_to_u8 (const scm_t_uint8 *str, size_t latin_len,
1991               scm_t_uint8 *u8_result, size_t *u8_lenp)
1992 {
1993   size_t i, n;
1994   size_t u8_len = latin1_u8_strlen (str, latin_len);
1995
1996   if (!(u8_result && u8_lenp && *u8_lenp > u8_len))
1997     u8_result = scm_malloc (u8_len + 1);
1998   if (u8_lenp)
1999     *u8_lenp = u8_len;
2000
2001   for (i = 0, n = 0; i < latin_len; i++)
2002     n += u8_uctomb (u8_result + n, str[i], u8_len - n);
2003   if (n != u8_len)
2004     abort ();
2005   u8_result[n] = 0;
2006
2007   return u8_result;
2008 }
2009
2010 /* UTF-8 code table
2011
2012    (Note that this includes code points that are not allowed by Unicode,
2013     but since this function has no way to report an error, and its
2014     purpose is to determine the size of destination buffers for
2015     libunicode conversion functions, we err on the safe side and handle
2016     everything that libunicode might conceivably handle, now or in the
2017     future.)
2018
2019    Char. number range  |        UTF-8 octet sequence
2020       (hexadecimal)    |              (binary)
2021    --------------------+------------------------------------------------------
2022    0000 0000-0000 007F | 0xxxxxxx
2023    0000 0080-0000 07FF | 110xxxxx 10xxxxxx
2024    0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
2025    0001 0000-001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
2026    0020 0000-03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2027    0400 0000-7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2028 */
2029
2030 static size_t
2031 u32_u8_length_in_bytes (const scm_t_uint32 *str, size_t len)
2032 {
2033   size_t ret, i;
2034
2035   for (i = 0, ret = 0; i < len; i++)
2036     {
2037       scm_t_uint32 c = str[i];
2038
2039       if (c <= 0x7f)
2040         ret += 1;
2041       else if (c <= 0x7ff)
2042         ret += 2;
2043       else if (c <= 0xffff)
2044         ret += 3;
2045       else if (c <= 0x1fffff)
2046         ret += 4;
2047       else if (c <= 0x3ffffff)
2048         ret += 5;
2049       else
2050         ret += 6;
2051     }
2052
2053   return ret;
2054 }
2055
2056 char *
2057 scm_to_utf8_stringn (SCM str, size_t *lenp)
2058 #define FUNC_NAME "scm_to_utf8_stringn"
2059 {
2060   SCM_VALIDATE_STRING (1, str);
2061
2062   if (scm_i_is_narrow_string (str))
2063     return (char *) latin1_to_u8 ((scm_t_uint8 *) scm_i_string_chars (str),
2064                                   scm_i_string_length (str),
2065                                   NULL, lenp);
2066   else
2067     {
2068       scm_t_uint32 *chars = (scm_t_uint32 *) scm_i_string_wide_chars (str);
2069       scm_t_uint8 *buf, *ret;
2070       size_t num_chars = scm_i_string_length (str);
2071       size_t num_bytes_predicted, num_bytes_actual;
2072
2073       num_bytes_predicted = u32_u8_length_in_bytes (chars, num_chars);
2074
2075       if (lenp)
2076         {
2077           *lenp = num_bytes_predicted;
2078           buf = scm_malloc (num_bytes_predicted);
2079         }
2080       else
2081         {
2082           buf = scm_malloc (num_bytes_predicted + 1);
2083           buf[num_bytes_predicted] = 0;
2084         }
2085
2086       num_bytes_actual = num_bytes_predicted;
2087       ret = u32_to_u8 (chars, num_chars, buf, &num_bytes_actual);
2088
2089       if (SCM_LIKELY (ret == buf && num_bytes_actual == num_bytes_predicted))
2090         return (char *) ret;
2091
2092       /* An error: a bad codepoint.  */
2093       {
2094         int saved_errno = errno;
2095
2096         free (buf);
2097         if (!saved_errno)
2098           abort ();
2099
2100         scm_decoding_error ("scm_to_utf8_stringn", errno,
2101                             "invalid codepoint in string", str);
2102
2103         /* Not reached.  */
2104         return NULL;
2105       }
2106     }
2107 }
2108 #undef FUNC_NAME
2109
2110 scm_t_wchar *
2111 scm_to_utf32_string (SCM str)
2112 {
2113   return scm_to_utf32_stringn (str, NULL);
2114 }
2115
2116 scm_t_wchar *
2117 scm_to_utf32_stringn (SCM str, size_t *lenp)
2118 #define FUNC_NAME "scm_to_utf32_stringn"
2119 {
2120   scm_t_wchar *result;
2121
2122   SCM_VALIDATE_STRING (1, str);
2123
2124   if (scm_i_is_narrow_string (str))
2125     {
2126       scm_t_uint8 *codepoints;
2127       size_t i, len;
2128
2129       codepoints = (scm_t_uint8*) scm_i_string_chars (str);
2130       len = scm_i_string_length (str);
2131       if (lenp)
2132         *lenp = len;
2133
2134       result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2135       for (i = 0; i < len; i++)
2136         result[i] = codepoints[i];
2137       result[len] = 0;
2138     }
2139   else
2140     {
2141       size_t len;
2142
2143       len = scm_i_string_length (str);
2144       if (lenp)
2145         *lenp = len;
2146
2147       result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2148       memcpy (result, scm_i_string_wide_chars (str),
2149               len * sizeof (scm_t_wchar));
2150       result[len] = 0;
2151     }
2152
2153   return result;
2154 }
2155 #undef FUNC_NAME
2156
2157 char *
2158 scm_to_port_string (SCM str, SCM port)
2159 {
2160   return scm_to_port_stringn (str, NULL, port);
2161 }
2162
2163 char *
2164 scm_to_port_stringn (SCM str, size_t *lenp, SCM port)
2165 {
2166   scm_t_port *pt = SCM_PTAB_ENTRY (port);
2167   scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
2168
2169   if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1
2170       && pt->ilseq_handler == SCM_FAILED_CONVERSION_ERROR)
2171     return scm_to_latin1_stringn (str, lenp);
2172   else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8)
2173     return scm_to_utf8_stringn (str, lenp);
2174   else
2175     return scm_to_stringn (str, lenp, pt->encoding, pt->ilseq_handler);
2176 }
2177
2178 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
2179    according to ENCODING.  If LENP is non-NULL, set it to the size in bytes of
2180    the returned buffer.  If the conversion to ENCODING fails, apply the strategy
2181    defined by HANDLER.  */
2182 char *
2183 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
2184                 scm_t_string_failed_conversion_handler handler)
2185 {
2186   char *buf;
2187   size_t ilen, len, i;
2188   int ret;
2189   const char *enc;
2190
2191   if (!scm_is_string (str))
2192     scm_wrong_type_arg_msg (NULL, 0, str, "string");
2193   ilen = scm_i_string_length (str);
2194
2195   if (ilen == 0)
2196     {
2197       buf = scm_malloc (1);
2198       buf[0] = '\0';
2199       if (lenp)
2200         *lenp = 0;
2201       return buf;
2202     }
2203
2204   if (lenp == NULL)
2205     for (i = 0; i < ilen; i++)
2206       if (scm_i_string_ref (str, i) == '\0')
2207         scm_misc_error (NULL,
2208                         "string contains #\\nul character: ~S",
2209                         scm_list_1 (str));
2210
2211   if (scm_i_is_narrow_string (str)
2212       && c_strcasecmp (encoding, "ISO-8859-1") == 0)
2213     {
2214       /* If using native Latin-1 encoding, just copy the string
2215          contents.  */
2216       if (lenp)
2217         {
2218           buf = scm_malloc (ilen);
2219           memcpy (buf, scm_i_string_chars (str), ilen);
2220           *lenp = ilen;
2221           return buf;
2222         }
2223       else
2224         {
2225           buf = scm_malloc (ilen + 1);
2226           memcpy (buf, scm_i_string_chars (str), ilen);
2227           buf[ilen] = '\0';
2228           return buf;
2229         }
2230     }
2231
2232
2233   buf = NULL;
2234   len = 0;
2235   enc = encoding;
2236   if (enc == NULL)
2237     enc = "ISO-8859-1";
2238   if (scm_i_is_narrow_string (str))
2239     {
2240       ret = mem_iconveh (scm_i_string_chars (str), ilen,
2241                          "ISO-8859-1", enc,
2242                          (enum iconv_ilseq_handler) handler, NULL,
2243                          &buf, &len);
2244
2245       if (ret != 0)
2246         scm_encoding_error (__func__, errno,
2247                             "cannot convert narrow string to output locale",
2248                             SCM_BOOL_F,
2249                             /* FIXME: Faulty character unknown.  */
2250                             SCM_BOOL_F);
2251     }
2252   else
2253     {
2254       buf = u32_conv_to_encoding (enc,
2255                                   (enum iconv_ilseq_handler) handler,
2256                                   (scm_t_uint32 *) scm_i_string_wide_chars (str),
2257                                   ilen,
2258                                   NULL,
2259                                   NULL, &len);
2260       if (buf == NULL)
2261         scm_encoding_error (__func__, errno,
2262                             "cannot convert wide string to output locale",
2263                             SCM_BOOL_F,
2264                             /* FIXME: Faulty character unknown.  */
2265                             SCM_BOOL_F);
2266     }
2267   if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
2268     {
2269       if (SCM_R6RS_ESCAPES_P)
2270         {
2271           /* The worst case is if the input string contains all 4-digit
2272              hex escapes.  "\uXXXX" (six characters) becomes "\xXXXX;"
2273              (seven characters).  Make BUF large enough to hold
2274              that.  */
2275           buf = scm_realloc (buf, (len * 7) / 6 + 1);
2276           unistring_escapes_to_r6rs_escapes (buf, &len);
2277         }
2278       else
2279         unistring_escapes_to_guile_escapes (buf, &len);
2280
2281       buf = scm_realloc (buf, len);
2282     }
2283   if (lenp)
2284     *lenp = len;
2285   else
2286     {
2287       buf = scm_realloc (buf, len + 1);
2288       buf[len] = '\0';
2289     }
2290
2291   scm_remember_upto_here_1 (str);
2292   return buf;
2293 }
2294
2295 size_t
2296 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
2297 {
2298   size_t len;
2299   char *result = NULL;
2300   if (!scm_is_string (str))
2301     scm_wrong_type_arg_msg (NULL, 0, str, "string");
2302   result = scm_to_locale_stringn (str, &len);
2303
2304   memcpy (buf, result, (len > max_len) ? max_len : len);
2305   free (result);
2306
2307   scm_remember_upto_here_1 (str);
2308   return len;
2309 }
2310
2311 \f
2312 /* Unicode string normalization.  */
2313
2314 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2315    libguile/i18n.c.  It would be useful to have this factored out into a more
2316    convenient location, but its use of alloca makes that tricky to do. */
2317
2318 static SCM
2319 normalize_str (SCM string, uninorm_t form)
2320 {
2321   SCM ret;
2322   scm_t_uint32 *w_str;
2323   scm_t_wchar *cbuf;
2324   size_t rlen, len = scm_i_string_length (string);
2325
2326   if (scm_i_is_narrow_string (string))
2327     {
2328       size_t i;
2329       const char *buf = scm_i_string_chars (string);
2330
2331       w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2332
2333       for (i = 0; i < len; i ++)
2334         w_str[i] = (unsigned char) buf[i];
2335       w_str[len] = 0;
2336     }
2337   else
2338     w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2339
2340   w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2341
2342   ret = scm_i_make_wide_string (rlen, &cbuf, 0);
2343   u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2344   free (w_str);
2345
2346   scm_i_try_narrow_string (ret);
2347
2348   return ret;
2349 }
2350
2351 SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2352             (SCM string),
2353             "Returns the NFC normalized form of @var{string}.")
2354 #define FUNC_NAME s_scm_string_normalize_nfc
2355 {
2356   SCM_VALIDATE_STRING (1, string);
2357   return normalize_str (string, UNINORM_NFC);
2358 }
2359 #undef FUNC_NAME
2360
2361 SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2362             (SCM string),
2363             "Returns the NFD normalized form of @var{string}.")
2364 #define FUNC_NAME s_scm_string_normalize_nfd
2365 {
2366   SCM_VALIDATE_STRING (1, string);
2367   return normalize_str (string, UNINORM_NFD);
2368 }
2369 #undef FUNC_NAME
2370
2371 SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2372             (SCM string),
2373             "Returns the NFKC normalized form of @var{string}.")
2374 #define FUNC_NAME s_scm_string_normalize_nfkc
2375 {
2376   SCM_VALIDATE_STRING (1, string);
2377   return normalize_str (string, UNINORM_NFKC);
2378 }
2379 #undef FUNC_NAME
2380
2381 SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2382             (SCM string),
2383             "Returns the NFKD normalized form of @var{string}.")
2384 #define FUNC_NAME s_scm_string_normalize_nfkd
2385 {
2386   SCM_VALIDATE_STRING (1, string);
2387   return normalize_str (string, UNINORM_NFKD);
2388 }
2389 #undef FUNC_NAME
2390
2391 /* converts C scm_array of strings to SCM scm_list of strings.
2392    If argc < 0, a null terminated scm_array is assumed.
2393    The current locale encoding is assumed */
2394 SCM
2395 scm_makfromstrs (int argc, char **argv)
2396 {
2397   int i = argc;
2398   SCM lst = SCM_EOL;
2399   if (0 > i)
2400     for (i = 0; argv[i]; i++);
2401   while (i--)
2402     lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2403   return lst;
2404 }
2405
2406 /* Return a newly allocated array of char pointers to each of the strings
2407    in args, with a terminating NULL pointer.  The strings are encoded using
2408    the current locale. */
2409
2410 char **
2411 scm_i_allocate_string_pointers (SCM list)
2412 #define FUNC_NAME "scm_i_allocate_string_pointers"
2413 {
2414   char **result;
2415   int list_len = scm_ilength (list);
2416   int i;
2417
2418   if (list_len < 0)
2419     scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2420
2421   result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2422                           "string pointers");
2423   result[list_len] = NULL;
2424
2425   /* The list might have been modified in another thread, so
2426      we check LIST before each access.
2427    */
2428   for (i = 0; i < list_len && scm_is_pair (list); i++)
2429     {
2430       SCM str = SCM_CAR (list);
2431       size_t len;  /* String length in bytes */
2432       char *c_str = scm_to_locale_stringn (str, &len);
2433
2434       /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2435          scm_malloc to allocate the returned string, which must be
2436          explicitly deallocated.  This forces us to copy the string a
2437          second time into a new buffer.  Ideally there would be variants
2438          of scm_to_*_stringn that can return garbage-collected buffers. */
2439
2440       result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2441       memcpy (result[i], c_str, len);
2442       result[i][len] = '\0';
2443       free (c_str);
2444
2445       list = SCM_CDR (list);
2446     }
2447
2448   return result;
2449 }
2450 #undef FUNC_NAME
2451
2452 void
2453 scm_i_get_substring_spec (size_t len,
2454                           SCM start, size_t *cstart,
2455                           SCM end, size_t *cend)
2456 {
2457   if (SCM_UNBNDP (start))
2458     *cstart = 0;
2459   else
2460     *cstart = scm_to_unsigned_integer (start, 0, len);
2461
2462   if (SCM_UNBNDP (end))
2463     *cend = len;
2464   else
2465     *cend = scm_to_unsigned_integer (end, *cstart, len);
2466 }
2467
2468 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2469
2470 void
2471 scm_init_strings ()
2472 {
2473   scm_nullstr = scm_i_make_string (0, NULL, 0);
2474
2475 #include "libguile/strings.x"
2476 }
2477
2478
2479 /*
2480   Local Variables:
2481   c-file-style: "gnu"
2482   End:
2483 */