libguile/strings.c

   1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
   2  *
   3  * This library is free software; you can redistribute it and/or
   4  * modify it under the terms of the GNU Lesser General Public License
   5  * as published by the Free Software Foundation; either version 3 of
   6  * the License, or (at your option) any later version.
   7  *
   8  * This library is distributed in the hope that it will be useful, but
   9  * WITHOUT ANY WARRANTY; without even the implied warranty of
  10  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  11  * Lesser General Public License for more details.
  12  *
  13  * You should have received a copy of the GNU Lesser General Public
  14  * License along with this library; if not, write to the Free Software
  15  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
  16  * 02110-1301 USA
  17  */
  18
  19
  20 \f
  21 #ifdef HAVE_CONFIG_H
  22 # include <config.h>
  23 #endif
  24
  25 #include <alloca.h>
  26 #include <string.h>
  27 #include <stdio.h>
  28 #include <ctype.h>
  29 #include <uninorm.h>
  30 #include <unistr.h>
  31 #include <uniconv.h>
  32
  33 #include "striconveh.h"
  34
  35 #include "libguile/_scm.h"
  36 #include "libguile/chars.h"
  37 #include "libguile/root.h"
  38 #include "libguile/strings.h"
  39 #include "libguile/error.h"
  40 #include "libguile/generalized-vectors.h"
  41 #include "libguile/deprecation.h"
  42 #include "libguile/validate.h"
  43 #include "libguile/private-options.h"
  44
  45 \f
  46
  47 /* {Strings}
  48  */
  49
  50
  51 /* Stringbufs
  52  *
  53  * XXX - keeping an accurate refcount during GC seems to be quite
  54  * tricky, so we just keep score of whether a stringbuf might be
  55  * shared, not whether it definitely is.
  56  *
  57  * The scheme I (mvo) tried to keep an accurate reference count would
  58  * recount all strings that point to a stringbuf during the mark-phase
  59  * of the GC.  This was done since one cannot access the stringbuf of
  60  * a string when that string is freed (in order to decrease the
  61  * reference count).  The memory of the stringbuf might have been
  62  * reused already for something completely different.
  63  *
  64  * This recounted worked for a small number of threads beating on
  65  * cow-strings, but it failed randomly with more than 10 threads, say.
  66  * I couldn't figure out what went wrong, so I used the conservative
  67  * approach implemented below.
  68  *
  69  * There are 2 storage strategies for stringbufs: 8-bit and wide.  8-bit
  70  * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
  71  * strings.
  72  */
  73
  74 /* The size in words of the stringbuf header (type tag + size).  */
  75 #define STRINGBUF_HEADER_SIZE   2U
  76
  77 #define STRINGBUF_HEADER_BYTES  (STRINGBUF_HEADER_SIZE * sizeof (SCM))
  78
  79 #define STRINGBUF_F_SHARED      SCM_I_STRINGBUF_F_SHARED
  80 #define STRINGBUF_F_WIDE        SCM_I_STRINGBUF_F_WIDE
  81
  82 #define STRINGBUF_TAG           scm_tc7_stringbuf
  83 #define STRINGBUF_SHARED(buf)   (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
  84 #define STRINGBUF_WIDE(buf)     (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
  85
  86 #define STRINGBUF_CONTENTS(buf) ((void *)                               \
  87                                  SCM_CELL_OBJECT_LOC (buf,              \
  88                                                       STRINGBUF_HEADER_SIZE))
  89 #define STRINGBUF_CHARS(buf)    ((unsigned char *) STRINGBUF_CONTENTS (buf))
  90 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
  91
  92 #define STRINGBUF_LENGTH(buf)   (SCM_CELL_WORD_1 (buf))
  93
  94 #define SET_STRINGBUF_SHARED(buf)                                       \
  95   do                                                                    \
  96     {                                                                   \
  97       /* Don't modify BUF if it's already marked as shared since it might be \
  98          a read-only, statically allocated stringbuf.  */               \
  99       if (SCM_LIKELY (!STRINGBUF_SHARED (buf)))                         \
 100         SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
 101     }                                                                   \
 102   while (0)
 103
 104 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 105 static size_t lenhist[1001];
 106 #endif
 107
 108 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
 109    characters. */
 110 static SCM
 111 make_stringbuf (size_t len)
 112 {
 113   /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
 114      scm_i_symbol_chars, all stringbufs are null-terminated.  Once
 115      SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
 116      has been changed for scm_i_symbol_chars, this null-termination
 117      can be dropped.
 118   */
 119
 120   SCM buf;
 121
 122 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 123   if (len < 1000)
 124     lenhist[len]++;
 125   else
 126     lenhist[1000]++;
 127 #endif
 128
 129   buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
 130                                             "string"));
 131
 132   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
 133   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 134
 135   STRINGBUF_CHARS (buf)[len] = 0;
 136
 137   return buf;
 138 }
 139
 140 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
 141    characters.  */
 142 static SCM
 143 make_wide_stringbuf (size_t len)
 144 {
 145   SCM buf;
 146   size_t raw_len;
 147
 148 #ifdef SCM_STRING_LENGTH_HISTOGRAM
 149   if (len < 1000)
 150     lenhist[len]++;
 151   else
 152     lenhist[1000]++;
 153 #endif
 154
 155   raw_len = (len + 1) * sizeof (scm_t_wchar);
 156   buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
 157                                             "string"));
 158
 159   SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
 160   SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
 161
 162   STRINGBUF_WIDE_CHARS (buf)[len] = 0;
 163
 164   return buf;
 165 }
 166
 167 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
 168    characters from BUF.  */
 169 static SCM
 170 wide_stringbuf (SCM buf)
 171 {
 172   SCM new_buf;
 173
 174   if (STRINGBUF_WIDE (buf))
 175     new_buf = buf;
 176   else
 177     {
 178       size_t i, len;
 179       scm_t_wchar *mem;
 180
 181       len = STRINGBUF_LENGTH (buf);
 182
 183       new_buf = make_wide_stringbuf (len);
 184
 185       mem = STRINGBUF_WIDE_CHARS (new_buf);
 186       for (i = 0; i < len; i++)
 187         mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
 188       mem[len] = 0;
 189     }
 190
 191   return new_buf;
 192 }
 193
 194 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
 195    characters from BUF, if possible.  */
 196 static SCM
 197 narrow_stringbuf (SCM buf)
 198 {
 199   SCM new_buf;
 200
 201   if (!STRINGBUF_WIDE (buf))
 202     new_buf = buf;
 203   else
 204     {
 205       size_t i, len;
 206       scm_t_wchar *wmem;
 207       unsigned char *mem;
 208
 209       len = STRINGBUF_LENGTH (buf);
 210       wmem = STRINGBUF_WIDE_CHARS (buf);
 211
 212       for (i = 0; i < len; i++)
 213         if (wmem[i] > 0xFF)
 214           /* BUF cannot be narrowed.  */
 215           return buf;
 216
 217       new_buf = make_stringbuf (len);
 218
 219       mem = STRINGBUF_CHARS (new_buf);
 220       for (i = 0; i < len; i++)
 221         mem[i] = (unsigned char) wmem[i];
 222       mem[len] = 0;
 223     }
 224
 225   return new_buf;
 226 }
 227
 228 scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
 229
 230 \f
 231 /* Copy-on-write strings.
 232  */
 233
 234 #define STRING_TAG            scm_tc7_string
 235
 236 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
 237 #define STRING_START(str)     ((size_t)SCM_CELL_WORD_2(str))
 238 #define STRING_LENGTH(str)    ((size_t)SCM_CELL_WORD_3(str))
 239
 240 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
 241 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
 242
 243 #define IS_STRING(str)        (SCM_HAS_TYP7 (str, STRING_TAG))
 244
 245 /* Read-only strings.
 246  */
 247
 248 #define RO_STRING_TAG         scm_tc7_ro_string
 249 #define IS_RO_STRING(str)     (SCM_CELL_TYPE(str)==RO_STRING_TAG)
 250
 251 /* Mutation-sharing substrings
 252  */
 253
 254 #define SH_STRING_TAG       (scm_tc7_string + 0x100)
 255
 256 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
 257 /* START and LENGTH as for STRINGs. */
 258
 259 #define IS_SH_STRING(str)   (SCM_CELL_TYPE(str)==SH_STRING_TAG)
 260
 261 SCM scm_nullstr;
 262
 263 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
 264    characters.  CHARSP, if not NULL, will be set to location of the
 265    char array.  If READ_ONLY_P, the returned string is read-only;
 266    otherwise it is writable.  */
 267 SCM
 268 scm_i_make_string (size_t len, char **charsp, int read_only_p)
 269 {
 270   static SCM null_stringbuf = SCM_BOOL_F;
 271   SCM buf;
 272   SCM res;
 273
 274   if (len == 0)
 275     {
 276       if (SCM_UNLIKELY (scm_is_false (null_stringbuf)))
 277         {
 278           null_stringbuf = make_stringbuf (0);
 279           SET_STRINGBUF_SHARED (null_stringbuf);
 280         }
 281       buf = null_stringbuf;
 282     }
 283   else
 284     buf = make_stringbuf (len);
 285
 286   if (charsp)
 287     *charsp = (char *) STRINGBUF_CHARS (buf);
 288   res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
 289                          SCM_UNPACK (buf),
 290                          (scm_t_bits) 0, (scm_t_bits) len);
 291   return res;
 292 }
 293
 294 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
 295    characters.  CHARSP, if not NULL, will be set to location of the
 296    character array.  If READ_ONLY_P, the returned string is read-only;
 297    otherwise it is writable.  */
 298 SCM
 299 scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
 300 {
 301   SCM buf = make_wide_stringbuf (len);
 302   SCM res;
 303   if (charsp)
 304     *charsp = STRINGBUF_WIDE_CHARS (buf);
 305   res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
 306                          SCM_UNPACK (buf),
 307                          (scm_t_bits) 0, (scm_t_bits) len);
 308   return res;
 309 }
 310
 311 static void
 312 validate_substring_args (SCM str, size_t start, size_t end)
 313 {
 314   if (!IS_STRING (str))
 315     scm_wrong_type_arg_msg (NULL, 0, str, "string");
 316   if (start > STRING_LENGTH (str))
 317     scm_out_of_range (NULL, scm_from_size_t (start));
 318   if (end > STRING_LENGTH (str) || end < start)
 319     scm_out_of_range (NULL, scm_from_size_t (end));
 320 }
 321
 322 static inline void
 323 get_str_buf_start (SCM *str, SCM *buf, size_t *start)
 324 {
 325   *start = STRING_START (*str);
 326   if (IS_SH_STRING (*str))
 327     {
 328       *str = SH_STRING_STRING (*str);
 329       *start += STRING_START (*str);
 330     }
 331   *buf = STRING_STRINGBUF (*str);
 332 }
 333
 334 SCM
 335 scm_i_substring (SCM str, size_t start, size_t end)
 336 {
 337   if (start == end)
 338     return scm_i_make_string (0, NULL, 0);
 339   else
 340     {
 341       SCM buf;
 342       size_t str_start;
 343       get_str_buf_start (&str, &buf, &str_start);
 344       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 345       SET_STRINGBUF_SHARED (buf);
 346       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 347       return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
 348                               (scm_t_bits)str_start + start,
 349                               (scm_t_bits) end - start);
 350     }
 351 }
 352
 353 SCM
 354 scm_i_substring_read_only (SCM str, size_t start, size_t end)
 355 {
 356   if (start == end)
 357     return scm_i_make_string (0, NULL, 1);
 358   else
 359     {
 360       SCM buf;
 361       size_t str_start;
 362       get_str_buf_start (&str, &buf, &str_start);
 363       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 364       SET_STRINGBUF_SHARED (buf);
 365       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 366       return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
 367                               (scm_t_bits)str_start + start,
 368                               (scm_t_bits) end - start);
 369     }
 370 }
 371
 372 SCM
 373 scm_i_substring_copy (SCM str, size_t start, size_t end)
 374 {
 375   if (start == end)
 376     return scm_i_make_string (0, NULL, 0);
 377   else
 378     {
 379       size_t len = end - start;
 380       SCM buf, my_buf, substr;
 381       size_t str_start;
 382       int wide = 0;
 383       get_str_buf_start (&str, &buf, &str_start);
 384       if (scm_i_is_narrow_string (str))
 385         {
 386           my_buf = make_stringbuf (len);
 387           memcpy (STRINGBUF_CHARS (my_buf),
 388                   STRINGBUF_CHARS (buf) + str_start + start, len);
 389         }
 390       else
 391         {
 392           my_buf = make_wide_stringbuf (len);
 393           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
 394                    (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
 395                                      + start), len);
 396           wide = 1;
 397         }
 398       scm_remember_upto_here_1 (buf);
 399       substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
 400                                 (scm_t_bits) 0, (scm_t_bits) len);
 401       if (wide)
 402         scm_i_try_narrow_string (substr);
 403       return substr;
 404     }
 405 }
 406
 407 SCM
 408 scm_i_substring_shared (SCM str, size_t start, size_t end)
 409 {
 410   if (start == 0 && end == STRING_LENGTH (str))
 411     return str;
 412   else if (start == end)
 413     return scm_i_make_string (0, NULL, 0);
 414   else
 415     {
 416       size_t len = end - start;
 417       if (IS_SH_STRING (str))
 418         {
 419           start += STRING_START (str);
 420           str = SH_STRING_STRING (str);
 421         }
 422       return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
 423                               (scm_t_bits)start, (scm_t_bits) len);
 424     }
 425 }
 426
 427 SCM
 428 scm_c_substring (SCM str, size_t start, size_t end)
 429 {
 430   validate_substring_args (str, start, end);
 431   return scm_i_substring (str, start, end);
 432 }
 433
 434 SCM
 435 scm_c_substring_read_only (SCM str, size_t start, size_t end)
 436 {
 437   validate_substring_args (str, start, end);
 438   return scm_i_substring_read_only (str, start, end);
 439 }
 440
 441 SCM
 442 scm_c_substring_copy (SCM str, size_t start, size_t end)
 443 {
 444   validate_substring_args (str, start, end);
 445   return scm_i_substring_copy (str, start, end);
 446 }
 447
 448 SCM
 449 scm_c_substring_shared (SCM str, size_t start, size_t end)
 450 {
 451   validate_substring_args (str, start, end);
 452   return scm_i_substring_shared (str, start, end);
 453 }
 454
 455 \f
 456 /* Internal accessors
 457  */
 458
 459 /* Returns the number of characters in STR.  This may be different
 460    than the memory size of the string storage.  */
 461 size_t
 462 scm_i_string_length (SCM str)
 463 {
 464   return STRING_LENGTH (str);
 465 }
 466
 467 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
 468    encoding.  False if it is 'wide', having a 32-bit UCS-4
 469    encoding.  */
 470 int
 471 scm_i_is_narrow_string (SCM str)
 472 {
 473   if (IS_SH_STRING (str))
 474     str = SH_STRING_STRING (str);
 475
 476   return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
 477 }
 478
 479 /* Try to coerce a string to be narrow.  It if is narrow already, do
 480    nothing.  If it is wide, shrink it to narrow if none of its
 481    characters are above 0xFF.  Return true if the string is narrow or
 482    was made to be narrow.  */
 483 int
 484 scm_i_try_narrow_string (SCM str)
 485 {
 486   if (IS_SH_STRING (str))
 487     str = SH_STRING_STRING (str);
 488
 489   SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
 490
 491   return scm_i_is_narrow_string (str);
 492 }
 493
 494 /* Return a pointer to the raw data of the string, which can be either Latin-1
 495    or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'.  */
 496 const void *
 497 scm_i_string_data (SCM str)
 498 {
 499   SCM buf;
 500   size_t start;
 501   const char *data;
 502
 503   get_str_buf_start (&str, &buf, &start);
 504
 505   data = STRINGBUF_CONTENTS (buf);
 506   data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
 507
 508   return data;
 509 }
 510
 511 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
 512    STR.  */
 513 const char *
 514 scm_i_string_chars (SCM str)
 515 {
 516   SCM buf;
 517   size_t start;
 518   get_str_buf_start (&str, &buf, &start);
 519   if (scm_i_is_narrow_string (str))
 520     return (const char *) STRINGBUF_CHARS (buf) + start;
 521   else
 522     scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
 523                     scm_list_1 (str));
 524   return NULL;
 525 }
 526
 527 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
 528    STR.  */
 529 const scm_t_wchar *
 530 scm_i_string_wide_chars (SCM str)
 531 {
 532   SCM buf;
 533   size_t start;
 534
 535   get_str_buf_start (&str, &buf, &start);
 536   if (!scm_i_is_narrow_string (str))
 537     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
 538   else
 539     scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
 540                     scm_list_1 (str));
 541 }
 542
 543 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
 544    a new string buffer, so that it can be modified without modifying
 545    other strings.  Also, lock the string mutex.  Later, one must call
 546    scm_i_string_stop_writing to unlock the mutex.  */
 547 SCM
 548 scm_i_string_start_writing (SCM orig_str)
 549 {
 550   SCM buf, str = orig_str;
 551   size_t start;
 552
 553   get_str_buf_start (&str, &buf, &start);
 554   if (IS_RO_STRING (str))
 555     scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
 556
 557   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 558   if (STRINGBUF_SHARED (buf))
 559     {
 560       /* Clone the stringbuf.  */
 561       size_t len = STRING_LENGTH (str);
 562       SCM new_buf;
 563
 564       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 565
 566       if (scm_i_is_narrow_string (str))
 567         {
 568           new_buf = make_stringbuf (len);
 569           memcpy (STRINGBUF_CHARS (new_buf),
 570                   STRINGBUF_CHARS (buf) + STRING_START (str), len);
 571
 572         }
 573       else
 574         {
 575           new_buf = make_wide_stringbuf (len);
 576           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 577                    (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
 578                                      + STRING_START (str)), len);
 579         }
 580
 581       SET_STRING_STRINGBUF (str, new_buf);
 582       start -= STRING_START (str);
 583
 584       /* FIXME: The following operations are not atomic, so other threads
 585          looking at STR may see an inconsistent state.  Nevertheless it can't
 586          hurt much since (i) accessing STR while it is being mutated can't
 587          yield a crash, and (ii) concurrent accesses to STR should be
 588          protected by a mutex at the application level.  The latter may not
 589          apply when STR != ORIG_STR, though.  */
 590       SET_STRING_START (str, 0);
 591       SET_STRING_STRINGBUF (str, new_buf);
 592
 593       buf = new_buf;
 594
 595       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 596     }
 597   return orig_str;
 598 }
 599
 600 /* Return a pointer to the 8-bit Latin-1 chars of a string.  */
 601 char *
 602 scm_i_string_writable_chars (SCM str)
 603 {
 604   SCM buf;
 605   size_t start;
 606
 607   get_str_buf_start (&str, &buf, &start);
 608   if (scm_i_is_narrow_string (str))
 609     return (char *) STRINGBUF_CHARS (buf) + start;
 610   else
 611     scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
 612                     scm_list_1 (str));
 613   return NULL;
 614 }
 615
 616 /* Return a pointer to the UCS-4 codepoints of a string.  */
 617 static scm_t_wchar *
 618 scm_i_string_writable_wide_chars (SCM str)
 619 {
 620   SCM buf;
 621   size_t start;
 622
 623   get_str_buf_start (&str, &buf, &start);
 624   if (!scm_i_is_narrow_string (str))
 625     return STRINGBUF_WIDE_CHARS (buf) + start;
 626   else
 627     scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
 628                     scm_list_1 (str));
 629 }
 630
 631 /* Unlock the string mutex that was locked when
 632    scm_i_string_start_writing was called.  */
 633 void
 634 scm_i_string_stop_writing (void)
 635 {
 636   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 637 }
 638
 639 /* Return the Xth character of STR as a UCS-4 codepoint.  */
 640 scm_t_wchar
 641 scm_i_string_ref (SCM str, size_t x)
 642 {
 643   if (scm_i_is_narrow_string (str))
 644     return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
 645   else
 646     return scm_i_string_wide_chars (str)[x];
 647 }
 648
 649 /* Returns index+1 of the first char in STR that matches C, or
 650    0 if the char is not found.  */
 651 int
 652 scm_i_string_contains_char (SCM str, char ch)
 653 {
 654   size_t i;
 655   size_t len = scm_i_string_length (str);
 656
 657   i = 0;
 658   if (scm_i_is_narrow_string (str))
 659     {
 660       while (i < len)
 661         {
 662           if (scm_i_string_chars (str)[i] == ch)
 663             return i+1;
 664           i++;
 665         }
 666     }
 667   else
 668     {
 669       while (i < len)
 670         {
 671           if (scm_i_string_wide_chars (str)[i]
 672               == (unsigned char) ch)
 673             return i+1;
 674           i++;
 675         }
 676     }
 677   return 0;
 678 }
 679
 680 int
 681 scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
 682 {
 683   if (scm_i_is_narrow_string (sstr))
 684     {
 685       const char *a = scm_i_string_chars (sstr) + start_x;
 686       const char *b = cstr;
 687       return strncmp (a, b, strlen(b));
 688     }
 689   else
 690     {
 691       size_t i;
 692       const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
 693       const char *b = cstr;
 694       for (i = 0; i < strlen (b); i++)
 695         {
 696           if (a[i] != (unsigned char) b[i])
 697             return 1;
 698         }
 699     }
 700   return 0;
 701 }
 702
 703 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
 704 void
 705 scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
 706 {
 707   if (IS_SH_STRING (str))
 708     {
 709       p += STRING_START (str);
 710       str = SH_STRING_STRING (str);
 711     }
 712
 713   if (chr > 0xFF && scm_i_is_narrow_string (str))
 714     SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
 715
 716   if (scm_i_is_narrow_string (str))
 717     {
 718       char *dst = scm_i_string_writable_chars (str);
 719       dst[p] = chr;
 720     }
 721   else
 722     {
 723       scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
 724       dst[p] = chr;
 725     }
 726 }
 727
 728 \f
 729 /* Symbols.
 730
 731    Basic symbol creation and accessing is done here, the rest is in
 732    symbols.[hc].  This has been done to keep stringbufs and the
 733    internals of strings and string-like objects confined to this file.
 734 */
 735
 736 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
 737
 738 SCM
 739 scm_i_make_symbol (SCM name, scm_t_bits flags,
 740                    unsigned long hash, SCM props)
 741 {
 742   SCM buf;
 743   size_t start = STRING_START (name);
 744   size_t length = STRING_LENGTH (name);
 745
 746   if (IS_SH_STRING (name))
 747     {
 748       name = SH_STRING_STRING (name);
 749       start += STRING_START (name);
 750     }
 751   buf = STRING_STRINGBUF (name);
 752
 753   if (start == 0 && length == STRINGBUF_LENGTH (buf))
 754     {
 755       /* reuse buf. */
 756       scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 757       SET_STRINGBUF_SHARED (buf);
 758       scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 759     }
 760   else
 761     {
 762       /* make new buf. */
 763       if (scm_i_is_narrow_string (name))
 764         {
 765           SCM new_buf = make_stringbuf (length);
 766           memcpy (STRINGBUF_CHARS (new_buf),
 767                   STRINGBUF_CHARS (buf) + start, length);
 768           buf = new_buf;
 769         }
 770       else
 771         {
 772           SCM new_buf = make_wide_stringbuf (length);
 773           u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
 774                    (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
 775                    length);
 776           buf = new_buf;
 777         }
 778     }
 779   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 780                           (scm_t_bits) hash, SCM_UNPACK (props));
 781 }
 782
 783 SCM
 784 scm_i_c_make_symbol (const char *name, size_t len,
 785                      scm_t_bits flags, unsigned long hash, SCM props)
 786 {
 787   SCM buf = make_stringbuf (len);
 788   memcpy (STRINGBUF_CHARS (buf), name, len);
 789
 790   return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
 791                           (scm_t_bits) hash, SCM_UNPACK (props));
 792 }
 793
 794 /* Returns the number of characters in SYM.  This may be different
 795    from the memory size of SYM.  */
 796 size_t
 797 scm_i_symbol_length (SCM sym)
 798 {
 799   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 800 }
 801
 802 size_t
 803 scm_c_symbol_length (SCM sym)
 804 #define FUNC_NAME "scm_c_symbol_length"
 805 {
 806   SCM_VALIDATE_SYMBOL (1, sym);
 807
 808   return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
 809 }
 810 #undef FUNC_NAME
 811
 812 /* True if the name of SYM is stored as a Latin-1 encoded string.
 813    False if it is stored as a 32-bit UCS-4-encoded string.  */
 814 int
 815 scm_i_is_narrow_symbol (SCM sym)
 816 {
 817   SCM buf;
 818
 819   buf = SYMBOL_STRINGBUF (sym);
 820   return !STRINGBUF_WIDE (buf);
 821 }
 822
 823 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
 824    contains the name of SYM.  */
 825 const char *
 826 scm_i_symbol_chars (SCM sym)
 827 {
 828   SCM buf;
 829
 830   buf = SYMBOL_STRINGBUF (sym);
 831   if (!STRINGBUF_WIDE (buf))
 832     return (const char *) STRINGBUF_CHARS (buf);
 833   else
 834     scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
 835                     scm_list_1 (sym));
 836 }
 837
 838 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
 839    symbol's name.  */
 840 const scm_t_wchar *
 841 scm_i_symbol_wide_chars (SCM sym)
 842 {
 843   SCM buf;
 844
 845   buf = SYMBOL_STRINGBUF (sym);
 846   if (STRINGBUF_WIDE (buf))
 847     return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
 848   else
 849     scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
 850                     scm_list_1 (sym));
 851 }
 852
 853 SCM
 854 scm_i_symbol_substring (SCM sym, size_t start, size_t end)
 855 {
 856   SCM buf = SYMBOL_STRINGBUF (sym);
 857   scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
 858   SET_STRINGBUF_SHARED (buf);
 859   scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
 860   return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
 861                           (scm_t_bits)start, (scm_t_bits) end - start);
 862 }
 863
 864 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint.  */
 865 scm_t_wchar
 866 scm_i_symbol_ref (SCM sym, size_t x)
 867 {
 868   if (scm_i_is_narrow_symbol (sym))
 869     return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
 870   else
 871     return scm_i_symbol_wide_chars (sym)[x];
 872 }
 873
 874 /* Debugging
 875  */
 876
 877 SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
 878             "Returns an association list containing debugging information\n"
 879             "for @var{str}. The association list has the following entries."
 880             "@table @code\n"
 881             "@item string\n"
 882             "The string itself.\n"
 883             "@item start\n"
 884             "The start index of the string into its stringbuf\n"
 885             "@item length\n"
 886             "The length of the string\n"
 887             "@item shared\n"
 888             "If this string is a substring, it returns its parent string.\n"
 889             "Otherwise, it returns @code{#f}\n"
 890             "@item read-only\n"
 891             "@code{#t} if the string is read-only\n"
 892             "@item stringbuf-chars\n"
 893             "A new string containing this string's stringbuf's characters\n"
 894             "@item stringbuf-length\n"
 895             "The number of characters in this stringbuf\n"
 896             "@item stringbuf-shared\n"
 897             "@code{#t} if this stringbuf is shared\n"
 898             "@item stringbuf-wide\n"
 899             "@code{#t} if this stringbuf's characters are stored in a\n"
 900             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
 901             "buffer\n"
 902             "@end table")
 903 #define FUNC_NAME s_scm_sys_string_dump
 904 {
 905   SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
 906   SCM buf;
 907   SCM_VALIDATE_STRING (1, str);
 908
 909   /* String info */
 910   e1 = scm_cons (scm_from_latin1_symbol ("string"),
 911                  str);
 912   e2 = scm_cons (scm_from_latin1_symbol ("start"),
 913                  scm_from_size_t (STRING_START (str)));
 914   e3 = scm_cons (scm_from_latin1_symbol ("length"),
 915                  scm_from_size_t (STRING_LENGTH (str)));
 916
 917   if (IS_SH_STRING (str))
 918     {
 919       e4 = scm_cons (scm_from_latin1_symbol ("shared"),
 920                      SH_STRING_STRING (str));
 921       buf = STRING_STRINGBUF (SH_STRING_STRING (str));
 922     }
 923   else
 924     {
 925       e4 = scm_cons (scm_from_latin1_symbol ("shared"),
 926                      SCM_BOOL_F);
 927       buf = STRING_STRINGBUF (str);
 928     }
 929
 930   if (IS_RO_STRING (str))
 931     e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
 932                    SCM_BOOL_T);
 933   else
 934     e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
 935                    SCM_BOOL_F);
 936
 937   /* Stringbuf info */
 938   if (!STRINGBUF_WIDE (buf))
 939     {
 940       size_t len = STRINGBUF_LENGTH (buf);
 941       char *cbuf;
 942       SCM sbc = scm_i_make_string (len, &cbuf, 0);
 943       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
 944       e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 945                      sbc);
 946     }
 947   else
 948     {
 949       size_t len = STRINGBUF_LENGTH (buf);
 950       scm_t_wchar *cbuf;
 951       SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
 952       u32_cpy ((scm_t_uint32 *) cbuf,
 953                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
 954       e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
 955                      sbc);
 956     }
 957   e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
 958                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
 959   if (STRINGBUF_SHARED (buf))
 960     e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 961                    SCM_BOOL_T);
 962   else
 963     e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
 964                    SCM_BOOL_F);
 965   if (STRINGBUF_WIDE (buf))
 966     e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 967                    SCM_BOOL_T);
 968   else
 969     e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
 970                    SCM_BOOL_F);
 971
 972   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
 973 }
 974 #undef FUNC_NAME
 975
 976 SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
 977             "Returns an association list containing debugging information\n"
 978             "for @var{sym}. The association list has the following entries."
 979             "@table @code\n"
 980             "@item symbol\n"
 981             "The symbol itself\n"
 982             "@item hash\n"
 983             "Its hash value\n"
 984             "@item interned\n"
 985             "@code{#t} if it is an interned symbol\n"
 986             "@item stringbuf-chars\n"
 987             "A new string containing this symbols's stringbuf's characters\n"
 988             "@item stringbuf-length\n"
 989             "The number of characters in this stringbuf\n"
 990             "@item stringbuf-shared\n"
 991             "@code{#t} if this stringbuf is shared\n"
 992             "@item stringbuf-wide\n"
 993             "@code{#t} if this stringbuf's characters are stored in a\n"
 994             "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
 995             "buffer\n"
 996             "@end table")
 997 #define FUNC_NAME s_scm_sys_symbol_dump
 998 {
 999   SCM e1, e2, e3, e4, e5, e6, e7;
1000   SCM buf;
1001   SCM_VALIDATE_SYMBOL (1, sym);
1002   e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
1003                  sym);
1004   e2 = scm_cons (scm_from_latin1_symbol ("hash"),
1005                  scm_from_ulong (scm_i_symbol_hash (sym)));
1006   e3 = scm_cons (scm_from_latin1_symbol ("interned"),
1007                  scm_symbol_interned_p (sym));
1008   buf = SYMBOL_STRINGBUF (sym);
1009
1010   /* Stringbuf info */
1011   if (!STRINGBUF_WIDE (buf))
1012     {
1013       size_t len = STRINGBUF_LENGTH (buf);
1014       char *cbuf;
1015       SCM sbc = scm_i_make_string (len, &cbuf, 0);
1016       memcpy (cbuf, STRINGBUF_CHARS (buf), len);
1017       e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1018                      sbc);
1019     }
1020   else
1021     {
1022       size_t len = STRINGBUF_LENGTH (buf);
1023       scm_t_wchar *cbuf;
1024       SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
1025       u32_cpy ((scm_t_uint32 *) cbuf,
1026                (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
1027       e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
1028                      sbc);
1029     }
1030   e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
1031                  scm_from_size_t (STRINGBUF_LENGTH (buf)));
1032   if (STRINGBUF_SHARED (buf))
1033     e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1034                    SCM_BOOL_T);
1035   else
1036     e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1037                    SCM_BOOL_F);
1038   if (STRINGBUF_WIDE (buf))
1039     e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1040                     SCM_BOOL_T);
1041   else
1042     e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1043                     SCM_BOOL_F);
1044   return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
1045
1046 }
1047 #undef FUNC_NAME
1048
1049 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1050
1051 SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
1052 #define FUNC_NAME s_scm_sys_stringbuf_hist
1053 {
1054   int i;
1055   for (i = 0; i < 1000; i++)
1056     if (lenhist[i])
1057       fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1058   fprintf (stderr, ">999: %u\n", lenhist[1000]);
1059   return SCM_UNSPECIFIED;
1060 }
1061 #undef FUNC_NAME
1062
1063 #endif
1064
1065 \f
1066
1067 SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1068             (SCM obj),
1069             "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1070 #define FUNC_NAME s_scm_string_p
1071 {
1072   return scm_from_bool (IS_STRING (obj));
1073 }
1074 #undef FUNC_NAME
1075
1076
1077 SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1078
1079 SCM_DEFINE (scm_string, "string", 0, 0, 1,
1080             (SCM chrs),
1081             "@deffnx {Scheme Procedure} list->string chrs\n"
1082             "Return a newly allocated string composed of the arguments,\n"
1083             "@var{chrs}.")
1084 #define FUNC_NAME s_scm_string
1085 {
1086   SCM result = SCM_BOOL_F;
1087   SCM rest;
1088   size_t len;
1089   size_t p = 0;
1090   long i;
1091   int wide = 0;
1092
1093   /* Verify that this is a list of chars.  */
1094   i = scm_ilength (chrs);
1095   SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1096
1097   len = (size_t) i;
1098   rest = chrs;
1099
1100   while (len > 0 && scm_is_pair (rest))
1101     {
1102       SCM elt = SCM_CAR (rest);
1103       SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1104       if (SCM_CHAR (elt) > 0xFF)
1105         wide = 1;
1106       rest = SCM_CDR (rest);
1107       len--;
1108       scm_remember_upto_here_1 (elt);
1109     }
1110
1111   /* Construct a string containing this list of chars.  */
1112   len = (size_t) i;
1113   rest = chrs;
1114
1115   if (wide == 0)
1116     {
1117       char *buf;
1118
1119       result = scm_i_make_string (len, NULL, 0);
1120       result = scm_i_string_start_writing (result);
1121       buf = scm_i_string_writable_chars (result);
1122       while (len > 0 && scm_is_pair (rest))
1123         {
1124           SCM elt = SCM_CAR (rest);
1125           buf[p] = (unsigned char) SCM_CHAR (elt);
1126           p++;
1127           rest = SCM_CDR (rest);
1128           len--;
1129           scm_remember_upto_here_1 (elt);
1130         }
1131     }
1132   else
1133     {
1134       scm_t_wchar *buf;
1135
1136       result = scm_i_make_wide_string (len, NULL, 0);
1137       result = scm_i_string_start_writing (result);
1138       buf = scm_i_string_writable_wide_chars (result);
1139       while (len > 0 && scm_is_pair (rest))
1140         {
1141           SCM elt = SCM_CAR (rest);
1142           buf[p] = SCM_CHAR (elt);
1143           p++;
1144           rest = SCM_CDR (rest);
1145           len--;
1146           scm_remember_upto_here_1 (elt);
1147         }
1148     }
1149   scm_i_string_stop_writing ();
1150
1151   if (len > 0)
1152     scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1153   if (!scm_is_null (rest))
1154     scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1155
1156   return result;
1157 }
1158 #undef FUNC_NAME
1159
1160 SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1161             (SCM k, SCM chr),
1162             "Return a newly allocated string of\n"
1163             "length @var{k}.  If @var{chr} is given, then all elements of\n"
1164             "the string are initialized to @var{chr}, otherwise the contents\n"
1165             "of the string are all set to @code{#\nul}.")
1166 #define FUNC_NAME s_scm_make_string
1167 {
1168   return scm_c_make_string (scm_to_size_t (k), chr);
1169 }
1170 #undef FUNC_NAME
1171
1172 SCM
1173 scm_c_make_string (size_t len, SCM chr)
1174 #define FUNC_NAME NULL
1175 {
1176   size_t p;
1177   char *contents = NULL;
1178   SCM res = scm_i_make_string (len, &contents, 0);
1179
1180   /* If no char is given, initialize string contents to NULL.  */
1181   if (SCM_UNBNDP (chr))
1182     memset (contents, 0, len);
1183   else
1184     {
1185       SCM_VALIDATE_CHAR (0, chr);
1186       res = scm_i_string_start_writing (res);
1187       for (p = 0; p < len; p++)
1188         scm_i_string_set_x (res, p, SCM_CHAR (chr));
1189       scm_i_string_stop_writing ();
1190     }
1191
1192   return res;
1193 }
1194 #undef FUNC_NAME
1195
1196 SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1197             (SCM string),
1198             "Return the number of characters in @var{string}.")
1199 #define FUNC_NAME s_scm_string_length
1200 {
1201   SCM_VALIDATE_STRING (1, string);
1202   return scm_from_size_t (STRING_LENGTH (string));
1203 }
1204 #undef FUNC_NAME
1205
1206 SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1207             (SCM string),
1208             "Return the bytes used to represent a character in @var{string}."
1209             "This will return 1 or 4.")
1210 #define FUNC_NAME s_scm_string_bytes_per_char
1211 {
1212   SCM_VALIDATE_STRING (1, string);
1213   if (!scm_i_is_narrow_string (string))
1214     return scm_from_int (4);
1215
1216   return scm_from_int (1);
1217 }
1218 #undef FUNC_NAME
1219
1220 size_t
1221 scm_c_string_length (SCM string)
1222 {
1223   if (!IS_STRING (string))
1224     scm_wrong_type_arg_msg (NULL, 0, string, "string");
1225   return STRING_LENGTH (string);
1226 }
1227
1228 SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1229             (SCM str, SCM k),
1230             "Return character @var{k} of @var{str} using zero-origin\n"
1231             "indexing. @var{k} must be a valid index of @var{str}.")
1232 #define FUNC_NAME s_scm_string_ref
1233 {
1234   size_t len;
1235   unsigned long idx;
1236
1237   SCM_VALIDATE_STRING (1, str);
1238
1239   len = scm_i_string_length (str);
1240   if (SCM_LIKELY (len > 0))
1241     idx = scm_to_unsigned_integer (k, 0, len - 1);
1242   else
1243     scm_out_of_range (NULL, k);
1244
1245   if (scm_i_is_narrow_string (str))
1246     return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1247   else
1248     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1249 }
1250 #undef FUNC_NAME
1251
1252 SCM
1253 scm_c_string_ref (SCM str, size_t p)
1254 {
1255   if (p >= scm_i_string_length (str))
1256     scm_out_of_range (NULL, scm_from_size_t (p));
1257   if (scm_i_is_narrow_string (str))
1258     return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1259   else
1260     return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1261
1262 }
1263
1264 SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1265             (SCM str, SCM k, SCM chr),
1266             "Store @var{chr} in element @var{k} of @var{str} and return\n"
1267             "an unspecified value. @var{k} must be a valid index of\n"
1268             "@var{str}.")
1269 #define FUNC_NAME s_scm_string_set_x
1270 {
1271   size_t len;
1272   unsigned long idx;
1273
1274   SCM_VALIDATE_STRING (1, str);
1275
1276   len = scm_i_string_length (str);
1277   if (SCM_LIKELY (len > 0))
1278     idx = scm_to_unsigned_integer (k, 0, len - 1);
1279   else
1280     scm_out_of_range (NULL, k);
1281
1282   SCM_VALIDATE_CHAR (3, chr);
1283   str = scm_i_string_start_writing (str);
1284   scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1285   scm_i_string_stop_writing ();
1286
1287   return SCM_UNSPECIFIED;
1288 }
1289 #undef FUNC_NAME
1290
1291 void
1292 scm_c_string_set_x (SCM str, size_t p, SCM chr)
1293 {
1294   if (p >= scm_i_string_length (str))
1295     scm_out_of_range (NULL, scm_from_size_t (p));
1296   str = scm_i_string_start_writing (str);
1297   scm_i_string_set_x (str, p, SCM_CHAR (chr));
1298   scm_i_string_stop_writing ();
1299 }
1300
1301 SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1302             (SCM str, SCM start, SCM end),
1303             "Return a newly allocated string formed from the characters\n"
1304             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1305             "ending with index @var{end} (exclusive).\n"
1306             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1307             "exact integers satisfying:\n\n"
1308             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1309 #define FUNC_NAME s_scm_substring
1310 {
1311   size_t len, from, to;
1312
1313   SCM_VALIDATE_STRING (1, str);
1314   len = scm_i_string_length (str);
1315   from = scm_to_unsigned_integer (start, 0, len);
1316   if (SCM_UNBNDP (end))
1317     to = len;
1318   else
1319     to = scm_to_unsigned_integer (end, from, len);
1320   return scm_i_substring (str, from, to);
1321 }
1322 #undef FUNC_NAME
1323
1324 SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1325             (SCM str, SCM start, SCM end),
1326             "Return a newly allocated string formed from the characters\n"
1327             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1328             "ending with index @var{end} (exclusive).\n"
1329             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1330             "exact integers satisfying:\n"
1331             "\n"
1332             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1333             "\n"
1334             "The returned string is read-only.\n")
1335 #define FUNC_NAME s_scm_substring_read_only
1336 {
1337   size_t len, from, to;
1338
1339   SCM_VALIDATE_STRING (1, str);
1340   len = scm_i_string_length (str);
1341   from = scm_to_unsigned_integer (start, 0, len);
1342   if (SCM_UNBNDP (end))
1343     to = len;
1344   else
1345     to = scm_to_unsigned_integer (end, from, len);
1346   return scm_i_substring_read_only (str, from, to);
1347 }
1348 #undef FUNC_NAME
1349
1350 SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1351             (SCM str, SCM start, SCM end),
1352             "Return a newly allocated string formed from the characters\n"
1353             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1354             "ending with index @var{end} (exclusive).\n"
1355             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1356             "exact integers satisfying:\n\n"
1357             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1358 #define FUNC_NAME s_scm_substring_copy
1359 {
1360   /* For the Scheme version, START is mandatory, but for the C
1361      version, it is optional.  See scm_string_copy in srfi-13.c for a
1362      rationale.
1363   */
1364
1365   size_t from, to;
1366
1367   SCM_VALIDATE_STRING (1, str);
1368   scm_i_get_substring_spec (scm_i_string_length (str),
1369                             start, &from, end, &to);
1370   return scm_i_substring_copy (str, from, to);
1371 }
1372 #undef FUNC_NAME
1373
1374 SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1375             (SCM str, SCM start, SCM end),
1376             "Return string that indirectly refers to the characters\n"
1377             "of @var{str} beginning with index @var{start} (inclusive) and\n"
1378             "ending with index @var{end} (exclusive).\n"
1379             "@var{str} must be a string, @var{start} and @var{end} must be\n"
1380             "exact integers satisfying:\n\n"
1381             "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1382 #define FUNC_NAME s_scm_substring_shared
1383 {
1384   size_t len, from, to;
1385
1386   SCM_VALIDATE_STRING (1, str);
1387   len = scm_i_string_length (str);
1388   from = scm_to_unsigned_integer (start, 0, len);
1389   if (SCM_UNBNDP (end))
1390     to = len;
1391   else
1392     to = scm_to_unsigned_integer (end, from, len);
1393   return scm_i_substring_shared (str, from, to);
1394 }
1395 #undef FUNC_NAME
1396
1397 SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1398             (SCM args),
1399             "Return a newly allocated string whose characters form the\n"
1400             "concatenation of the given strings, @var{args}.")
1401 #define FUNC_NAME s_scm_string_append
1402 {
1403   SCM res;
1404   size_t len = 0;
1405   int wide = 0;
1406   SCM l, s;
1407   size_t i;
1408   union
1409   {
1410     char *narrow;
1411     scm_t_wchar *wide;
1412   } data;
1413
1414   SCM_VALIDATE_REST_ARGUMENT (args);
1415   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1416     {
1417       s = SCM_CAR (l);
1418       SCM_VALIDATE_STRING (SCM_ARGn, s);
1419       len += scm_i_string_length (s);
1420       if (!scm_i_is_narrow_string (s))
1421         wide = 1;
1422     }
1423   data.narrow = NULL;
1424   if (!wide)
1425     res = scm_i_make_string (len, &data.narrow, 0);
1426   else
1427     res = scm_i_make_wide_string (len, &data.wide, 0);
1428
1429   for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1430     {
1431       size_t len;
1432       s = SCM_CAR (l);
1433       SCM_VALIDATE_STRING (SCM_ARGn, s);
1434       len = scm_i_string_length (s);
1435       if (!wide)
1436         {
1437           memcpy (data.narrow, scm_i_string_chars (s), len);
1438           data.narrow += len;
1439         }
1440       else
1441         {
1442           if (scm_i_is_narrow_string (s))
1443             {
1444               for (i = 0; i < scm_i_string_length (s); i++)
1445                 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
1446             }
1447           else
1448             u32_cpy ((scm_t_uint32 *) data.wide,
1449                      (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1450           data.wide += len;
1451         }
1452       scm_remember_upto_here_1 (s);
1453     }
1454   return res;
1455 }
1456 #undef FUNC_NAME
1457
1458
1459 \f
1460 /* Charset conversion error handling.  */
1461
1462 SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1463 SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1464
1465 /* Raise an exception informing that character CHR could not be written
1466    to PORT in its current encoding.  */
1467 void
1468 scm_encoding_error (const char *subr, int err, const char *message,
1469                     SCM port, SCM chr)
1470 {
1471   scm_throw (scm_encoding_error_key,
1472              scm_list_n (scm_from_latin1_string (subr),
1473                          scm_from_latin1_string (message),
1474                          scm_from_int (err),
1475                          port, chr,
1476                          SCM_UNDEFINED));
1477 }
1478
1479 /* Raise an exception informing of an encoding error on PORT.  This
1480    means that a character could not be written in PORT's encoding.  */
1481 void
1482 scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1483 {
1484   scm_throw (scm_decoding_error_key,
1485              scm_list_n (scm_from_latin1_string (subr),
1486                          scm_from_latin1_string (message),
1487                          scm_from_int (err),
1488                          port,
1489                          SCM_UNDEFINED));
1490 }
1491
1492 \f
1493 /* String conversion to/from C.  */
1494
1495 static void
1496 decoding_error (const char *func_name, int errno_save,
1497                 const char *str, size_t len)
1498 {
1499   /* Raise an error and pass the raw C string as a bytevector to the `throw'
1500      handler.  */
1501   SCM bv;
1502   signed char *buf;
1503
1504   buf = scm_gc_malloc_pointerless (len, "bytevector");
1505   memcpy (buf, str, len);
1506   bv = scm_c_take_gc_bytevector (buf, len, SCM_BOOL_F);
1507
1508   scm_decoding_error (func_name, errno_save,
1509                       "input locale conversion error", bv);
1510 }
1511
1512 SCM
1513 scm_from_stringn (const char *str, size_t len, const char *encoding,
1514                   scm_t_string_failed_conversion_handler handler)
1515 {
1516   size_t u32len, i;
1517   scm_t_wchar *u32;
1518   int wide = 0;
1519   SCM res;
1520
1521   /* The order of these checks is important. */
1522   if (!str && len != 0)
1523     scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1524   if (len == (size_t) -1)
1525     len = strlen (str);
1526
1527   if (strcmp (encoding, "ISO-8859-1") == 0 || len == 0)
1528     return scm_from_latin1_stringn (str, len);
1529   else if (strcmp (encoding, "UTF-8") == 0
1530            && handler == SCM_FAILED_CONVERSION_ERROR)
1531     return scm_from_utf8_stringn (str, len);
1532
1533   u32len = 0;
1534   u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1535                                                 (enum iconv_ilseq_handler)
1536                                                 handler,
1537                                                 str, len,
1538                                                 NULL,
1539                                                 NULL, &u32len);
1540
1541   if (SCM_UNLIKELY (u32 == NULL))
1542     decoding_error (__func__, errno, str, len);
1543
1544   i = 0;
1545   while (i < u32len)
1546     if (u32[i++] > 0xFF)
1547       {
1548         wide = 1;
1549         break;
1550       }
1551
1552   if (!wide)
1553     {
1554       char *dst;
1555       res = scm_i_make_string (u32len, &dst, 0);
1556       for (i = 0; i < u32len; i ++)
1557         dst[i] = (unsigned char) u32[i];
1558       dst[u32len] = '\0';
1559     }
1560   else
1561     {
1562       scm_t_wchar *wdst;
1563       res = scm_i_make_wide_string (u32len, &wdst, 0);
1564       u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1565       wdst[u32len] = 0;
1566     }
1567
1568   free (u32);
1569   return res;
1570 }
1571
1572 SCM
1573 scm_from_locale_string (const char *str)
1574 {
1575   return scm_from_locale_stringn (str, -1);
1576 }
1577
1578 SCM
1579 scm_from_locale_stringn (const char *str, size_t len)
1580 {
1581   return scm_from_stringn (str, len, locale_charset (),
1582                            scm_i_default_port_conversion_handler ());
1583 }
1584
1585 SCM
1586 scm_from_latin1_string (const char *str)
1587 {
1588   return scm_from_latin1_stringn (str, -1);
1589 }
1590
1591 SCM
1592 scm_from_latin1_stringn (const char *str, size_t len)
1593 {
1594   char *buf;
1595   SCM result;
1596
1597   if (len == (size_t) -1)
1598     len = strlen (str);
1599
1600   /* Make a narrow string and copy STR as is.  */
1601   result = scm_i_make_string (len, &buf, 0);
1602   memcpy (buf, str, len);
1603
1604   return result;
1605 }
1606
1607 SCM
1608 scm_from_utf8_string (const char *str)
1609 {
1610   return scm_from_utf8_stringn (str, -1);
1611 }
1612
1613 SCM
1614 scm_from_utf8_stringn (const char *str, size_t len)
1615 {
1616   size_t i, char_len;
1617   const scm_t_uint8 *ustr = (const scm_t_uint8 *) str;
1618   int ascii = 1, narrow = 1;
1619   SCM res;
1620
1621   if (len == (size_t) -1)
1622     len = strlen (str);
1623
1624   i = 0;
1625   char_len = 0;
1626
1627   while (i < len)
1628     {
1629       if (ustr[i] <= 127)
1630         {
1631           char_len++;
1632           i++;
1633         }
1634       else
1635         {
1636           ucs4_t c;
1637           int nbytes;
1638
1639           ascii = 0;
1640
1641           nbytes = u8_mbtouc (&c, ustr + i, len - i);
1642
1643           if (c == 0xfffd)
1644             /* Bad UTF-8.  */
1645             decoding_error (__func__, errno, str, len);
1646
1647           if (c > 255)
1648             narrow = 0;
1649
1650           char_len++;
1651           i += nbytes;
1652         }
1653     }
1654
1655   if (ascii)
1656     {
1657       char *dst;
1658       res = scm_i_make_string (char_len, &dst, 0);
1659       memcpy (dst, str, len);
1660     }
1661   else if (narrow)
1662     {
1663       char *dst;
1664       size_t j;
1665       ucs4_t c;
1666
1667       res = scm_i_make_string (char_len, &dst, 0);
1668
1669       for (i = 0, j = 0; i < len; j++)
1670         {
1671           i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1672           dst[j] = (signed char) c;
1673         }
1674     }
1675   else
1676     {
1677       scm_t_wchar *dst;
1678       size_t j;
1679       ucs4_t c;
1680
1681       res = scm_i_make_wide_string (char_len, &dst, 0);
1682
1683       for (i = 0, j = 0; i < len; j++)
1684         {
1685           i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1686           dst[j] = c;
1687         }
1688     }
1689
1690   return res;
1691 }
1692
1693 SCM
1694 scm_from_utf32_string (const scm_t_wchar *str)
1695 {
1696   return scm_from_utf32_stringn (str, -1);
1697 }
1698
1699 SCM
1700 scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1701 {
1702   SCM result;
1703   scm_t_wchar *buf;
1704
1705   if (len == (size_t) -1)
1706     len = u32_strlen ((uint32_t *) str);
1707
1708   result = scm_i_make_wide_string (len, &buf, 0);
1709   memcpy (buf, str, len * sizeof (scm_t_wchar));
1710   scm_i_try_narrow_string (result);
1711
1712   return result;
1713 }
1714
1715 SCM
1716 scm_from_port_string (const char *str, SCM port)
1717 {
1718   return scm_from_port_stringn (str, -1, port);
1719 }
1720
1721 SCM
1722 scm_from_port_stringn (const char *str, size_t len, SCM port)
1723 {
1724   scm_t_port *pt = SCM_PTAB_ENTRY (port);
1725
1726   if (pt->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1)
1727     return scm_from_latin1_stringn (str, len);
1728   else if (pt->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8
1729            && pt->ilseq_handler == SCM_FAILED_CONVERSION_ERROR)
1730     return scm_from_utf8_stringn (str, len);
1731   else
1732     return scm_from_stringn (str, len, pt->encoding, pt->ilseq_handler);
1733 }
1734
1735 /* Create a new scheme string from the C string STR.  The memory of
1736    STR may be used directly as storage for the new string.  */
1737 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1738    would be to register a finalizer to eventually free(3) STR, which isn't
1739    worth it.  Should we just deprecate the `scm_take_' functions?  */
1740 SCM
1741 scm_take_locale_stringn (char *str, size_t len)
1742 {
1743   SCM res;
1744
1745   res = scm_from_locale_stringn (str, len);
1746   free (str);
1747
1748   return res;
1749 }
1750
1751 SCM
1752 scm_take_locale_string (char *str)
1753 {
1754   return scm_take_locale_stringn (str, -1);
1755 }
1756
1757 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1758    *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1759    Set *LENP to the size of the resulting string.
1760
1761    FIXME: This is a hack we should get rid of.  See
1762    <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1763    for details.  */
1764 static void
1765 unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
1766 {
1767   char *before, *after;
1768   size_t i, j;
1769
1770   before = buf;
1771   after = buf;
1772   i = 0;
1773   j = 0;
1774   while (i < *lenp)
1775     {
1776       if ((i <= *lenp - 6)
1777           && before[i] == '\\'
1778           && before[i + 1] == 'u'
1779           && before[i + 2] == '0' && before[i + 3] == '0')
1780         {
1781           /* Convert \u00NN to \xNN */
1782           after[j] = '\\';
1783           after[j + 1] = 'x';
1784           after[j + 2] = tolower ((int) before[i + 4]);
1785           after[j + 3] = tolower ((int) before[i + 5]);
1786           i += 6;
1787           j += 4;
1788         }
1789       else if ((i <= *lenp - 10)
1790                && before[i] == '\\'
1791                && before[i + 1] == 'U'
1792                && before[i + 2] == '0' && before[i + 3] == '0')
1793         {
1794           /* Convert \U00NNNNNN to \UNNNNNN */
1795           after[j] = '\\';
1796           after[j + 1] = 'U';
1797           after[j + 2] = tolower ((int) before[i + 4]);
1798           after[j + 3] = tolower ((int) before[i + 5]);
1799           after[j + 4] = tolower ((int) before[i + 6]);
1800           after[j + 5] = tolower ((int) before[i + 7]);
1801           after[j + 6] = tolower ((int) before[i + 8]);
1802           after[j + 7] = tolower ((int) before[i + 9]);
1803           i += 10;
1804           j += 8;
1805         }
1806       else
1807         {
1808           after[j] = before[i];
1809           i++;
1810           j++;
1811         }
1812     }
1813   *lenp = j;
1814 }
1815
1816 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1817    *LENP-byte locale-encoded string, to `\xXXXX;'.  Set *LEN to the size
1818    of the resulting string.  BUF must be large enough to handle the
1819    worst case when `\uXXXX' escapes (6 characters) are replaced by
1820    `\xXXXX;' (7 characters).  */
1821 static void
1822 unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
1823 {
1824   char *before, *after;
1825   size_t i, j;
1826   /* The worst case is if the input string contains all 4-digit hex escapes.
1827      "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1828   size_t max_out_len = (*lenp * 7) / 6 + 1;
1829   size_t nzeros, ndigits;
1830
1831   before = buf;
1832   after = alloca (max_out_len);
1833   i = 0;
1834   j = 0;
1835   while (i < *lenp)
1836     {
1837       if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1838           || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1839         {
1840           if (before[i + 1] == 'u')
1841             ndigits = 4;
1842           else if (before[i + 1] == 'U')
1843             ndigits = 8;
1844           else
1845             abort ();
1846
1847           /* Add the R6RS hex escape initial sequence.  */
1848           after[j] = '\\';
1849           after[j + 1] = 'x';
1850
1851           /* Move string positions to the start of the hex numbers.  */
1852           i += 2;
1853           j += 2;
1854
1855           /* Find the number of initial zeros in this hex number.  */
1856           nzeros = 0;
1857           while (before[i + nzeros] == '0' && nzeros < ndigits)
1858             nzeros++;
1859
1860           /* Copy the number, skipping initial zeros, and then move the string
1861              positions.  */
1862           if (nzeros == ndigits)
1863             {
1864               after[j] = '0';
1865               i += ndigits;
1866               j += 1;
1867             }
1868           else
1869             {
1870               int pos;
1871               for (pos = 0; pos < ndigits - nzeros; pos++)
1872                 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1873               i += ndigits;
1874               j += (ndigits - nzeros);
1875             }
1876
1877           /* Add terminating semicolon.  */
1878           after[j] = ';';
1879           j++;
1880         }
1881       else
1882         {
1883           after[j] = before[i];
1884           i++;
1885           j++;
1886         }
1887     }
1888   *lenp = j;
1889   memcpy (before, after, j);
1890 }
1891
1892 char *
1893 scm_to_locale_string (SCM str)
1894 {
1895   return scm_to_locale_stringn (str, NULL);
1896 }
1897
1898 char *
1899 scm_to_locale_stringn (SCM str, size_t *lenp)
1900 {
1901   return scm_to_stringn (str, lenp,
1902                          locale_charset (),
1903                          scm_i_default_port_conversion_handler ());
1904 }
1905
1906 char *
1907 scm_to_latin1_string (SCM str)
1908 {
1909   return scm_to_latin1_stringn (str, NULL);
1910 }
1911
1912 char *
1913 scm_to_latin1_stringn (SCM str, size_t *lenp)
1914 #define FUNC_NAME "scm_to_latin1_stringn"
1915 {
1916   char *result;
1917
1918   SCM_VALIDATE_STRING (1, str);
1919
1920   if (scm_i_is_narrow_string (str))
1921     {
1922       size_t len = scm_i_string_length (str);
1923
1924       if (lenp)
1925         *lenp = len;
1926
1927       result = scm_strndup (scm_i_string_data (str), len);
1928     }
1929   else
1930     result = scm_to_stringn (str, lenp, NULL,
1931                              SCM_FAILED_CONVERSION_ERROR);
1932
1933   return result;
1934 }
1935 #undef FUNC_NAME
1936
1937 char *
1938 scm_to_utf8_string (SCM str)
1939 {
1940   return scm_to_utf8_stringn (str, NULL);
1941 }
1942
1943 static size_t
1944 latin1_u8_strlen (const scm_t_uint8 *str, size_t len)
1945 {
1946   size_t ret, i;
1947   for (i = 0, ret = 0; i < len; i++)
1948     ret += (str[i] < 128) ? 1 : 2;
1949   return ret;
1950 }
1951
1952 static scm_t_uint8*
1953 latin1_to_u8 (const scm_t_uint8 *str, size_t latin_len,
1954               scm_t_uint8 *u8_result, size_t *u8_lenp)
1955 {
1956   size_t i, n;
1957   size_t u8_len = latin1_u8_strlen (str, latin_len);
1958
1959   if (!(u8_result && u8_lenp && *u8_lenp > u8_len))
1960     u8_result = scm_malloc (u8_len + 1);
1961   if (u8_lenp)
1962     *u8_lenp = u8_len;
1963
1964   for (i = 0, n = 0; i < latin_len; i++)
1965     n += u8_uctomb (u8_result + n, str[i], u8_len - n);
1966   if (n != u8_len)
1967     abort ();
1968   u8_result[n] = 0;
1969
1970   return u8_result;
1971 }
1972
1973 /* UTF-8 code table
1974
1975    (Note that this includes code points that are not allowed by Unicode,
1976     but since this function has no way to report an error, and its
1977     purpose is to determine the size of destination buffers for
1978     libunicode conversion functions, we err on the safe side and handle
1979     everything that libunicode might conceivably handle, now or in the
1980     future.)
1981
1982    Char. number range  |        UTF-8 octet sequence
1983       (hexadecimal)    |              (binary)
1984    --------------------+------------------------------------------------------
1985    0000 0000-0000 007F | 0xxxxxxx
1986    0000 0080-0000 07FF | 110xxxxx 10xxxxxx
1987    0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
1988    0001 0000-001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1989    0020 0000-03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
1990    0400 0000-7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
1991 */
1992
1993 static size_t
1994 u32_u8_length_in_bytes (const scm_t_uint32 *str, size_t len)
1995 {
1996   size_t ret, i;
1997
1998   for (i = 0, ret = 0; i < len; i++)
1999     {
2000       scm_t_uint32 c = str[i];
2001
2002       if (c <= 0x7f)
2003         ret += 1;
2004       else if (c <= 0x7ff)
2005         ret += 2;
2006       else if (c <= 0xffff)
2007         ret += 3;
2008       else if (c <= 0x1fffff)
2009         ret += 4;
2010       else if (c <= 0x3ffffff)
2011         ret += 5;
2012       else
2013         ret += 6;
2014     }
2015
2016   return ret;
2017 }
2018
2019 char *
2020 scm_to_utf8_stringn (SCM str, size_t *lenp)
2021 #define FUNC_NAME "scm_to_utf8_stringn"
2022 {
2023   SCM_VALIDATE_STRING (1, str);
2024
2025   if (scm_i_is_narrow_string (str))
2026     return (char *) latin1_to_u8 ((scm_t_uint8 *) scm_i_string_chars (str),
2027                                   scm_i_string_length (str),
2028                                   NULL, lenp);
2029   else
2030     {
2031       scm_t_uint32 *chars = (scm_t_uint32 *) scm_i_string_wide_chars (str);
2032       scm_t_uint8 *buf, *ret;
2033       size_t num_chars = scm_i_string_length (str);
2034       size_t num_bytes_predicted, num_bytes_actual;
2035
2036       num_bytes_predicted = u32_u8_length_in_bytes (chars, num_chars);
2037
2038       if (lenp)
2039         {
2040           *lenp = num_bytes_predicted;
2041           buf = scm_malloc (num_bytes_predicted);
2042         }
2043       else
2044         {
2045           buf = scm_malloc (num_bytes_predicted + 1);
2046           buf[num_bytes_predicted] = 0;
2047         }
2048
2049       num_bytes_actual = num_bytes_predicted;
2050       ret = u32_to_u8 (chars, num_chars, buf, &num_bytes_actual);
2051
2052       if (SCM_LIKELY (ret == buf && num_bytes_actual == num_bytes_predicted))
2053         return (char *) ret;
2054
2055       /* An error: a bad codepoint.  */
2056       {
2057         int saved_errno = errno;
2058
2059         free (buf);
2060         if (!saved_errno)
2061           abort ();
2062
2063         scm_decoding_error ("scm_to_utf8_stringn", errno,
2064                             "invalid codepoint in string", str);
2065
2066         /* Not reached.  */
2067         return NULL;
2068       }
2069     }
2070 }
2071 #undef FUNC_NAME
2072
2073 scm_t_wchar *
2074 scm_to_utf32_string (SCM str)
2075 {
2076   return scm_to_utf32_stringn (str, NULL);
2077 }
2078
2079 scm_t_wchar *
2080 scm_to_utf32_stringn (SCM str, size_t *lenp)
2081 #define FUNC_NAME "scm_to_utf32_stringn"
2082 {
2083   scm_t_wchar *result;
2084
2085   SCM_VALIDATE_STRING (1, str);
2086
2087   if (scm_i_is_narrow_string (str))
2088     {
2089       scm_t_uint8 *codepoints;
2090       size_t i, len;
2091
2092       codepoints = (scm_t_uint8*) scm_i_string_chars (str);
2093       len = scm_i_string_length (str);
2094       if (lenp)
2095         *lenp = len;
2096
2097       result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2098       for (i = 0; i < len; i++)
2099         result[i] = codepoints[i];
2100       result[len] = 0;
2101     }
2102   else
2103     {
2104       size_t len;
2105
2106       len = scm_i_string_length (str);
2107       if (lenp)
2108         *lenp = len;
2109
2110       result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2111       memcpy (result, scm_i_string_wide_chars (str),
2112               len * sizeof (scm_t_wchar));
2113       result[len] = 0;
2114     }
2115
2116   return result;
2117 }
2118 #undef FUNC_NAME
2119
2120 char *
2121 scm_to_port_string (SCM str, SCM port)
2122 {
2123   return scm_to_port_stringn (str, NULL, port);
2124 }
2125
2126 char *
2127 scm_to_port_stringn (SCM str, size_t *lenp, SCM port)
2128 {
2129   scm_t_port *pt = SCM_PTAB_ENTRY (port);
2130
2131   if (pt->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1
2132       && pt->ilseq_handler == SCM_FAILED_CONVERSION_ERROR)
2133     return scm_to_latin1_stringn (str, lenp);
2134   else if (pt->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8)
2135     return scm_to_utf8_stringn (str, lenp);
2136   else
2137     return scm_to_stringn (str, lenp, pt->encoding, pt->ilseq_handler);
2138 }
2139
2140 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
2141    according to ENCODING.  If LENP is non-NULL, set it to the size in bytes of
2142    the returned buffer.  If the conversion to ENCODING fails, apply the strategy
2143    defined by HANDLER.  */
2144 char *
2145 scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
2146                 scm_t_string_failed_conversion_handler handler)
2147 {
2148   char *buf;
2149   size_t ilen, len, i;
2150   int ret;
2151   const char *enc;
2152
2153   if (!scm_is_string (str))
2154     scm_wrong_type_arg_msg (NULL, 0, str, "string");
2155   ilen = scm_i_string_length (str);
2156
2157   if (ilen == 0)
2158     {
2159       buf = scm_malloc (1);
2160       buf[0] = '\0';
2161       if (lenp)
2162         *lenp = 0;
2163       return buf;
2164     }
2165
2166   if (lenp == NULL)
2167     for (i = 0; i < ilen; i++)
2168       if (scm_i_string_ref (str, i) == '\0')
2169         scm_misc_error (NULL,
2170                         "string contains #\\nul character: ~S",
2171                         scm_list_1 (str));
2172
2173   if (scm_i_is_narrow_string (str) && strcmp (encoding, "ISO-8859-1") == 0)
2174     {
2175       /* If using native Latin-1 encoding, just copy the string
2176          contents.  */
2177       if (lenp)
2178         {
2179           buf = scm_malloc (ilen);
2180           memcpy (buf, scm_i_string_chars (str), ilen);
2181           *lenp = ilen;
2182           return buf;
2183         }
2184       else
2185         {
2186           buf = scm_malloc (ilen + 1);
2187           memcpy (buf, scm_i_string_chars (str), ilen);
2188           buf[ilen] = '\0';
2189           return buf;
2190         }
2191     }
2192
2193
2194   buf = NULL;
2195   len = 0;
2196   enc = encoding;
2197   if (enc == NULL)
2198     enc = "ISO-8859-1";
2199   if (scm_i_is_narrow_string (str))
2200     {
2201       ret = mem_iconveh (scm_i_string_chars (str), ilen,
2202                          "ISO-8859-1", enc,
2203                          (enum iconv_ilseq_handler) handler, NULL,
2204                          &buf, &len);
2205
2206       if (ret != 0)
2207         scm_encoding_error (__func__, errno,
2208                             "cannot convert narrow string to output locale",
2209                             SCM_BOOL_F,
2210                             /* FIXME: Faulty character unknown.  */
2211                             SCM_BOOL_F);
2212     }
2213   else
2214     {
2215       buf = u32_conv_to_encoding (enc,
2216                                   (enum iconv_ilseq_handler) handler,
2217                                   (scm_t_uint32 *) scm_i_string_wide_chars (str),
2218                                   ilen,
2219                                   NULL,
2220                                   NULL, &len);
2221       if (buf == NULL)
2222         scm_encoding_error (__func__, errno,
2223                             "cannot convert wide string to output locale",
2224                             SCM_BOOL_F,
2225                             /* FIXME: Faulty character unknown.  */
2226                             SCM_BOOL_F);
2227     }
2228   if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
2229     {
2230       if (SCM_R6RS_ESCAPES_P)
2231         {
2232           /* The worst case is if the input string contains all 4-digit
2233              hex escapes.  "\uXXXX" (six characters) becomes "\xXXXX;"
2234              (seven characters).  Make BUF large enough to hold
2235              that.  */
2236           buf = scm_realloc (buf, (len * 7) / 6 + 1);
2237           unistring_escapes_to_r6rs_escapes (buf, &len);
2238         }
2239       else
2240         unistring_escapes_to_guile_escapes (buf, &len);
2241
2242       buf = scm_realloc (buf, len);
2243     }
2244   if (lenp)
2245     *lenp = len;
2246   else
2247     {
2248       buf = scm_realloc (buf, len + 1);
2249       buf[len] = '\0';
2250     }
2251
2252   scm_remember_upto_here_1 (str);
2253   return buf;
2254 }
2255
2256 size_t
2257 scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
2258 {
2259   size_t len;
2260   char *result = NULL;
2261   if (!scm_is_string (str))
2262     scm_wrong_type_arg_msg (NULL, 0, str, "string");
2263   result = scm_to_locale_stringn (str, &len);
2264
2265   memcpy (buf, result, (len > max_len) ? max_len : len);
2266   free (result);
2267
2268   scm_remember_upto_here_1 (str);
2269   return len;
2270 }
2271
2272 \f
2273 /* Unicode string normalization.  */
2274
2275 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2276    libguile/i18n.c.  It would be useful to have this factored out into a more
2277    convenient location, but its use of alloca makes that tricky to do. */
2278
2279 static SCM
2280 normalize_str (SCM string, uninorm_t form)
2281 {
2282   SCM ret;
2283   scm_t_uint32 *w_str;
2284   scm_t_wchar *cbuf;
2285   size_t rlen, len = scm_i_string_length (string);
2286
2287   if (scm_i_is_narrow_string (string))
2288     {
2289       size_t i;
2290       const char *buf = scm_i_string_chars (string);
2291
2292       w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2293
2294       for (i = 0; i < len; i ++)
2295         w_str[i] = (unsigned char) buf[i];
2296       w_str[len] = 0;
2297     }
2298   else
2299     w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2300
2301   w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2302
2303   ret = scm_i_make_wide_string (rlen, &cbuf, 0);
2304   u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2305   free (w_str);
2306
2307   scm_i_try_narrow_string (ret);
2308
2309   return ret;
2310 }
2311
2312 SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2313             (SCM string),
2314             "Returns the NFC normalized form of @var{string}.")
2315 #define FUNC_NAME s_scm_string_normalize_nfc
2316 {
2317   SCM_VALIDATE_STRING (1, string);
2318   return normalize_str (string, UNINORM_NFC);
2319 }
2320 #undef FUNC_NAME
2321
2322 SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2323             (SCM string),
2324             "Returns the NFD normalized form of @var{string}.")
2325 #define FUNC_NAME s_scm_string_normalize_nfd
2326 {
2327   SCM_VALIDATE_STRING (1, string);
2328   return normalize_str (string, UNINORM_NFD);
2329 }
2330 #undef FUNC_NAME
2331
2332 SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2333             (SCM string),
2334             "Returns the NFKC normalized form of @var{string}.")
2335 #define FUNC_NAME s_scm_string_normalize_nfkc
2336 {
2337   SCM_VALIDATE_STRING (1, string);
2338   return normalize_str (string, UNINORM_NFKC);
2339 }
2340 #undef FUNC_NAME
2341
2342 SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2343             (SCM string),
2344             "Returns the NFKD normalized form of @var{string}.")
2345 #define FUNC_NAME s_scm_string_normalize_nfkd
2346 {
2347   SCM_VALIDATE_STRING (1, string);
2348   return normalize_str (string, UNINORM_NFKD);
2349 }
2350 #undef FUNC_NAME
2351
2352 /* converts C scm_array of strings to SCM scm_list of strings.
2353    If argc < 0, a null terminated scm_array is assumed.
2354    The current locale encoding is assumed */
2355 SCM
2356 scm_makfromstrs (int argc, char **argv)
2357 {
2358   int i = argc;
2359   SCM lst = SCM_EOL;
2360   if (0 > i)
2361     for (i = 0; argv[i]; i++);
2362   while (i--)
2363     lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2364   return lst;
2365 }
2366
2367 /* Return a newly allocated array of char pointers to each of the strings
2368    in args, with a terminating NULL pointer.  The strings are encoded using
2369    the current locale. */
2370
2371 char **
2372 scm_i_allocate_string_pointers (SCM list)
2373 #define FUNC_NAME "scm_i_allocate_string_pointers"
2374 {
2375   char **result;
2376   int list_len = scm_ilength (list);
2377   int i;
2378
2379   if (list_len < 0)
2380     scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2381
2382   result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2383                           "string pointers");
2384   result[list_len] = NULL;
2385
2386   /* The list might have been modified in another thread, so
2387      we check LIST before each access.
2388    */
2389   for (i = 0; i < list_len && scm_is_pair (list); i++)
2390     {
2391       SCM str = SCM_CAR (list);
2392       size_t len;  /* String length in bytes */
2393       char *c_str = scm_to_locale_stringn (str, &len);
2394
2395       /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2396          scm_malloc to allocate the returned string, which must be
2397          explicitly deallocated.  This forces us to copy the string a
2398          second time into a new buffer.  Ideally there would be variants
2399          of scm_to_*_stringn that can return garbage-collected buffers. */
2400
2401       result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2402       memcpy (result[i], c_str, len);
2403       result[i][len] = '\0';
2404       free (c_str);
2405
2406       list = SCM_CDR (list);
2407     }
2408
2409   return result;
2410 }
2411 #undef FUNC_NAME
2412
2413 void
2414 scm_i_get_substring_spec (size_t len,
2415                           SCM start, size_t *cstart,
2416                           SCM end, size_t *cend)
2417 {
2418   if (SCM_UNBNDP (start))
2419     *cstart = 0;
2420   else
2421     *cstart = scm_to_unsigned_integer (start, 0, len);
2422
2423   if (SCM_UNBNDP (end))
2424     *cend = len;
2425   else
2426     *cend = scm_to_unsigned_integer (end, *cstart, len);
2427 }
2428
2429 static SCM
2430 string_handle_ref (scm_t_array_handle *h, size_t index)
2431 {
2432   return scm_c_string_ref (h->array, index);
2433 }
2434
2435 static void
2436 string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2437 {
2438   scm_c_string_set_x (h->array, index, val);
2439 }
2440
2441 static void
2442 string_get_handle (SCM v, scm_t_array_handle *h)
2443 {
2444   h->array = v;
2445   h->ndims = 1;
2446   h->dims = &h->dim0;
2447   h->dim0.lbnd = 0;
2448   h->dim0.ubnd = scm_c_string_length (v) - 1;
2449   h->dim0.inc = 1;
2450   h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2451   h->elements = h->writable_elements = NULL;
2452 }
2453
2454 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2455                           string_handle_ref, string_handle_set,
2456                           string_get_handle)
2457 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2458
2459 void
2460 scm_init_strings ()
2461 {
2462   scm_nullstr = scm_i_make_string (0, NULL, 0);
2463
2464 #include "libguile/strings.x"
2465 }
2466
2467
2468 /*
2469   Local Variables:
2470   c-file-style: "gnu"
2471   End:
2472 */