Regression, scm_string fails to test for circular lists
[bpt/guile.git] / libguile / strings.c
CommitLineData
50b1996f 1/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009 Free Software Foundation, Inc.
0f2d19dd 2 *
73be1d9e 3 * This library is free software; you can redistribute it and/or
53befeb7
NJ
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
0f2d19dd 7 *
53befeb7
NJ
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
0f2d19dd 12 *
73be1d9e
MV
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
53befeb7
NJ
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
73be1d9e 17 */
1bbd0b84 18
1bbd0b84 19
0f2d19dd 20\f
dbb605f5
LC
21#ifdef HAVE_CONFIG_H
22# include <config.h>
23#endif
0f2d19dd 24
faf2c9d7 25#include <string.h>
3ee86942 26#include <stdio.h>
9c44cd45
MG
27#include <ctype.h>
28#include <unistr.h>
faf2c9d7 29
a0599745
MD
30#include "libguile/_scm.h"
31#include "libguile/chars.h"
7c33806a 32#include "libguile/root.h"
a0599745 33#include "libguile/strings.h"
1afff620 34#include "libguile/deprecation.h"
a0599745 35#include "libguile/validate.h"
c829a427 36#include "libguile/dynwind.h"
1afff620 37
0f2d19dd
JB
38\f
39
40/* {Strings}
41 */
42
3ee86942
MV
43
44/* Stringbufs
45 *
46 * XXX - keeping an accurate refcount during GC seems to be quite
47 * tricky, so we just keep score of whether a stringbuf might be
50b1996f 48 * shared, not whether it definitely is.
3ee86942
MV
49 *
50 * The scheme I (mvo) tried to keep an accurate reference count would
51 * recount all strings that point to a stringbuf during the mark-phase
52 * of the GC. This was done since one cannot access the stringbuf of
53 * a string when that string is freed (in order to decrease the
54 * reference count). The memory of the stringbuf might have been
55 * reused already for something completely different.
56 *
57 * This recounted worked for a small number of threads beating on
58 * cow-strings, but it failed randomly with more than 10 threads, say.
59 * I couldn't figure out what went wrong, so I used the conservative
60 * approach implemented below.
61 *
62 * A stringbuf needs to know its length, but only so that it can be
63 * reported when the stringbuf is freed.
64 *
50b1996f
MG
65 * There are 3 storage strategies for stringbufs: inline, outline, and
66 * wide.
67 *
68 * Inline strings are small 8-bit strings stored within the double
69 * cell itself. Outline strings are larger 8-bit strings with GC
70 * allocated storage. Wide strings are 32-bit strings with allocated
71 * storage.
72 *
73 * There was little value in making wide string inlineable, since
74 * there is only room for three inlined 32-bit characters. Thus wide
75 * stringbufs are never inlined.
3ee86942
MV
76 */
77
78#define STRINGBUF_F_SHARED 0x100
79#define STRINGBUF_F_INLINE 0x200
50b1996f
MG
80#define STRINGBUF_F_WIDE 0x400 /* If true, strings have UCS-4
81 encoding. Otherwise, strings
82 are Latin-1. */
3ee86942
MV
83
84#define STRINGBUF_TAG scm_tc7_stringbuf
85#define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
86#define STRINGBUF_INLINE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_INLINE)
9c44cd45 87#define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
3ee86942
MV
88
89#define STRINGBUF_OUTLINE_CHARS(buf) ((char *)SCM_CELL_WORD_1(buf))
90#define STRINGBUF_OUTLINE_LENGTH(buf) (SCM_CELL_WORD_2(buf))
91#define STRINGBUF_INLINE_CHARS(buf) ((char *)SCM_CELL_OBJECT_LOC(buf,1))
92#define STRINGBUF_INLINE_LENGTH(buf) (((size_t)SCM_CELL_WORD_0(buf))>>16)
93
94#define STRINGBUF_CHARS(buf) (STRINGBUF_INLINE (buf) \
95 ? STRINGBUF_INLINE_CHARS (buf) \
96 : STRINGBUF_OUTLINE_CHARS (buf))
50b1996f 97
9c44cd45 98#define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *)SCM_CELL_WORD_1(buf))
3ee86942
MV
99#define STRINGBUF_LENGTH(buf) (STRINGBUF_INLINE (buf) \
100 ? STRINGBUF_INLINE_LENGTH (buf) \
101 : STRINGBUF_OUTLINE_LENGTH (buf))
102
103#define STRINGBUF_MAX_INLINE_LEN (3*sizeof(scm_t_bits))
104
105#define SET_STRINGBUF_SHARED(buf) \
106 (SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED))
107
6ce6923b 108#if SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
109static size_t lenhist[1001];
110#endif
111
50b1996f
MG
112/* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
113 characters. */
3ee86942
MV
114static SCM
115make_stringbuf (size_t len)
0f2d19dd 116{
3ee86942
MV
117 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
118 scm_i_symbol_chars, all stringbufs are null-terminated. Once
119 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
120 has been changed for scm_i_symbol_chars, this null-termination
121 can be dropped.
122 */
123
6ce6923b 124#if SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
125 if (len < 1000)
126 lenhist[len]++;
127 else
128 lenhist[1000]++;
129#endif
0f2d19dd 130
3ee86942
MV
131 if (len <= STRINGBUF_MAX_INLINE_LEN-1)
132 {
133 return scm_double_cell (STRINGBUF_TAG | STRINGBUF_F_INLINE | (len << 16),
134 0, 0, 0);
135 }
136 else
137 {
138 char *mem = scm_gc_malloc (len+1, "string");
139 mem[len] = '\0';
140 return scm_double_cell (STRINGBUF_TAG, (scm_t_bits) mem,
141 (scm_t_bits) len, (scm_t_bits) 0);
142 }
143}
e53cc817 144
50b1996f
MG
145/* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
146 characters. */
9c44cd45
MG
147static SCM
148make_wide_stringbuf (size_t len)
149{
150 scm_t_wchar *mem;
6ce6923b 151#if SCM_STRING_LENGTH_HISTOGRAM
9c44cd45
MG
152 if (len < 1000)
153 lenhist[len]++;
154 else
155 lenhist[1000]++;
156#endif
157
158 mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
159 mem[len] = 0;
160 return scm_double_cell (STRINGBUF_TAG | STRINGBUF_F_WIDE, (scm_t_bits) mem,
161 (scm_t_bits) len, (scm_t_bits) 0);
162}
163
2b829bbb
KR
164/* Return a new stringbuf whose underlying storage consists of the LEN+1
165 octets pointed to by STR (the last octet is zero). */
7f74cf9a 166SCM
fd0a5bbc
HWN
167scm_i_take_stringbufn (char *str, size_t len)
168{
2b829bbb 169 scm_gc_register_collectable_memory (str, len + 1, "stringbuf");
fd0a5bbc
HWN
170
171 return scm_double_cell (STRINGBUF_TAG, (scm_t_bits) str,
172 (scm_t_bits) len, (scm_t_bits) 0);
173}
174
3ee86942
MV
175SCM
176scm_i_stringbuf_mark (SCM buf)
177{
178 return SCM_BOOL_F;
179}
1bbd0b84 180
3ee86942
MV
181void
182scm_i_stringbuf_free (SCM buf)
0f2d19dd 183{
3ee86942 184 if (!STRINGBUF_INLINE (buf))
9c44cd45
MG
185 {
186 if (!STRINGBUF_WIDE (buf))
187 scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf),
188 STRINGBUF_OUTLINE_LENGTH (buf) + 1, "string");
189 else
190 scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf),
191 sizeof (scm_t_wchar) * (STRINGBUF_OUTLINE_LENGTH (buf)
192 + 1), "string");
193 }
194
195}
196
50b1996f
MG
197/* Convert a stringbuf containing 8-bit Latin-1-encoded characters to
198 one containing 32-bit UCS-4-encoded characters. */
9c44cd45
MG
199static void
200widen_stringbuf (SCM buf)
201{
202 size_t i, len;
203 scm_t_wchar *mem;
204
205 if (STRINGBUF_WIDE (buf))
206 return;
207
208 if (STRINGBUF_INLINE (buf))
209 {
210 len = STRINGBUF_INLINE_LENGTH (buf);
211
212 mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
213 for (i = 0; i < len; i++)
214 mem[i] =
215 (scm_t_wchar) (unsigned char) STRINGBUF_INLINE_CHARS (buf)[i];
216 mem[len] = 0;
217
218 SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) ^ STRINGBUF_F_INLINE);
219 SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE);
220 SCM_SET_CELL_WORD_1 (buf, mem);
221 SCM_SET_CELL_WORD_2 (buf, len);
222 }
223 else
224 {
225 len = STRINGBUF_OUTLINE_LENGTH (buf);
226
227 mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
228 for (i = 0; i < len; i++)
229 mem[i] =
230 (scm_t_wchar) (unsigned char) STRINGBUF_OUTLINE_CHARS (buf)[i];
231 mem[len] = 0;
232
233 scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf), len + 1, "string");
234
235 SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE);
236 SCM_SET_CELL_WORD_1 (buf, mem);
237 SCM_SET_CELL_WORD_2 (buf, len);
238 }
3ee86942 239}
bd9e24b3 240
9de87eea 241scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
bd9e24b3 242
3ee86942
MV
243/* Copy-on-write strings.
244 */
bd9e24b3 245
3ee86942 246#define STRING_TAG scm_tc7_string
bd9e24b3 247
3ee86942
MV
248#define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
249#define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
250#define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
bd9e24b3 251
3ee86942
MV
252#define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
253#define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
254
255#define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
256
ed35de72
MV
257/* Read-only strings.
258 */
259
260#define RO_STRING_TAG (scm_tc7_string + 0x200)
261#define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
262
e1b29f6a
MV
263/* Mutation-sharing substrings
264 */
265
266#define SH_STRING_TAG (scm_tc7_string + 0x100)
267
268#define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
269/* START and LENGTH as for STRINGs. */
270
271#define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
272
50b1996f
MG
273/* Create a scheme string with space for LEN 8-bit Latin-1-encoded
274 characters. CHARSP, if not NULL, will be set to location of the
275 char array. */
3ee86942
MV
276SCM
277scm_i_make_string (size_t len, char **charsp)
278{
279 SCM buf = make_stringbuf (len);
280 SCM res;
281 if (charsp)
282 *charsp = STRINGBUF_CHARS (buf);
283 res = scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
284 (scm_t_bits)0, (scm_t_bits) len);
285 return res;
0f2d19dd
JB
286}
287
50b1996f
MG
288/* Create a scheme string with space for LEN 32-bit UCS-4-encoded
289 characters. CHARSP, if not NULL, will be set to location of the
290 character array. */
9c44cd45 291SCM
50b1996f 292scm_i_make_wide_string (size_t len, scm_t_wchar **charsp)
9c44cd45
MG
293{
294 SCM buf = make_wide_stringbuf (len);
295 SCM res;
296 if (charsp)
297 *charsp = STRINGBUF_WIDE_CHARS (buf);
298 res = scm_double_cell (STRING_TAG, SCM_UNPACK (buf),
299 (scm_t_bits) 0, (scm_t_bits) len);
300 return res;
301}
302
3ee86942
MV
303static void
304validate_substring_args (SCM str, size_t start, size_t end)
305{
306 if (!IS_STRING (str))
307 scm_wrong_type_arg_msg (NULL, 0, str, "string");
308 if (start > STRING_LENGTH (str))
309 scm_out_of_range (NULL, scm_from_size_t (start));
310 if (end > STRING_LENGTH (str) || end < start)
311 scm_out_of_range (NULL, scm_from_size_t (end));
312}
0f2d19dd 313
e1b29f6a
MV
314static inline void
315get_str_buf_start (SCM *str, SCM *buf, size_t *start)
316{
317 *start = STRING_START (*str);
318 if (IS_SH_STRING (*str))
319 {
320 *str = SH_STRING_STRING (*str);
321 *start += STRING_START (*str);
322 }
323 *buf = STRING_STRINGBUF (*str);
324}
325
3ee86942
MV
326SCM
327scm_i_substring (SCM str, size_t start, size_t end)
0f2d19dd 328{
e1b29f6a
MV
329 SCM buf;
330 size_t str_start;
331 get_str_buf_start (&str, &buf, &str_start);
9de87eea 332 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 333 SET_STRINGBUF_SHARED (buf);
9de87eea 334 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 335 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
e1b29f6a
MV
336 (scm_t_bits)str_start + start,
337 (scm_t_bits) end - start);
0f2d19dd
JB
338}
339
ed35de72
MV
340SCM
341scm_i_substring_read_only (SCM str, size_t start, size_t end)
342{
45a9f430
LC
343 SCM buf;
344 size_t str_start;
345 get_str_buf_start (&str, &buf, &str_start);
346 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
347 SET_STRINGBUF_SHARED (buf);
348 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
349 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
350 (scm_t_bits)str_start + start,
351 (scm_t_bits) end - start);
ed35de72
MV
352}
353
3ee86942
MV
354SCM
355scm_i_substring_copy (SCM str, size_t start, size_t end)
356{
357 size_t len = end - start;
edea856c 358 SCM buf, my_buf;
e1b29f6a
MV
359 size_t str_start;
360 get_str_buf_start (&str, &buf, &str_start);
9c44cd45
MG
361 if (scm_i_is_narrow_string (str))
362 {
363 my_buf = make_stringbuf (len);
364 memcpy (STRINGBUF_CHARS (my_buf),
365 STRINGBUF_CHARS (buf) + str_start + start, len);
366 }
367 else
368 {
369 my_buf = make_wide_stringbuf (len);
370 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
371 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
372 + start), len);
373 /* Even though this string is wide, the substring may be narrow.
50b1996f 374 Consider adding code to narrow the string. */
9c44cd45 375 }
3ee86942 376 scm_remember_upto_here_1 (buf);
9c44cd45
MG
377 return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
378 (scm_t_bits) 0, (scm_t_bits) len);
3ee86942 379}
0f2d19dd 380
e1b29f6a
MV
381SCM
382scm_i_substring_shared (SCM str, size_t start, size_t end)
383{
384 if (start == 0 && end == STRING_LENGTH (str))
385 return str;
386 else
387 {
388 size_t len = end - start;
389 if (IS_SH_STRING (str))
390 {
391 start += STRING_START (str);
392 str = SH_STRING_STRING (str);
393 }
394 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
395 (scm_t_bits)start, (scm_t_bits) len);
396 }
397}
398
3ee86942
MV
399SCM
400scm_c_substring (SCM str, size_t start, size_t end)
401{
402 validate_substring_args (str, start, end);
403 return scm_i_substring (str, start, end);
404}
ee149d03 405
ed35de72
MV
406SCM
407scm_c_substring_read_only (SCM str, size_t start, size_t end)
408{
409 validate_substring_args (str, start, end);
410 return scm_i_substring_read_only (str, start, end);
411}
412
0f2d19dd 413SCM
3ee86942 414scm_c_substring_copy (SCM str, size_t start, size_t end)
0f2d19dd 415{
3ee86942
MV
416 validate_substring_args (str, start, end);
417 return scm_i_substring_copy (str, start, end);
418}
419
3ee86942
MV
420SCM
421scm_c_substring_shared (SCM str, size_t start, size_t end)
422{
423 validate_substring_args (str, start, end);
424 return scm_i_substring_shared (str, start, end);
425}
0f2d19dd 426
ee149d03 427SCM
3ee86942 428scm_i_string_mark (SCM str)
ee149d03 429{
3ee86942
MV
430 if (IS_SH_STRING (str))
431 return SH_STRING_STRING (str);
432 else
433 return STRING_STRINGBUF (str);
ee149d03
JB
434}
435
3ee86942
MV
436void
437scm_i_string_free (SCM str)
438{
439}
36284627 440
3ee86942
MV
441/* Internal accessors
442 */
443
50b1996f
MG
444/* Returns the number of characters in STR. This may be different
445 than the memory size of the string storage. */
3ee86942
MV
446size_t
447scm_i_string_length (SCM str)
0f2d19dd 448{
3ee86942 449 return STRING_LENGTH (str);
0f2d19dd
JB
450}
451
50b1996f
MG
452/* True if the string is 'narrow', meaning it has a 8-bit Latin-1
453 encoding. False if it is 'wide', having a 32-bit UCS-4
454 encoding. */
9c44cd45
MG
455int
456scm_i_is_narrow_string (SCM str)
457{
458 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
459}
460
50b1996f
MG
461/* Returns a pointer to the 8-bit Latin-1 encoded character array of
462 STR. */
3ee86942
MV
463const char *
464scm_i_string_chars (SCM str)
465{
466 SCM buf;
e1b29f6a
MV
467 size_t start;
468 get_str_buf_start (&str, &buf, &start);
9c44cd45
MG
469 if (scm_i_is_narrow_string (str))
470 return STRINGBUF_CHARS (buf) + start;
471 else
472 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
473 scm_list_1 (str));
474 return NULL;
3ee86942 475}
b00418df 476
50b1996f
MG
477/* Returns a pointer to the 32-bit UCS-4 encoded character array of
478 STR. */
9c44cd45
MG
479const scm_t_wchar *
480scm_i_string_wide_chars (SCM str)
481{
482 SCM buf;
483 size_t start;
484
485 get_str_buf_start (&str, &buf, &start);
486 if (!scm_i_is_narrow_string (str))
487 return STRINGBUF_WIDE_CHARS (buf) + start;
488 else
489 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
490 scm_list_1 (str));
491}
492
493/* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
494 a new string buffer, so that it can be modified without modifying
50b1996f
MG
495 other strings. Also, lock the string mutex. Later, one must call
496 scm_i_string_stop_writing to unlock the mutex. */
9c44cd45
MG
497SCM
498scm_i_string_start_writing (SCM orig_str)
b00418df 499{
ed35de72 500 SCM buf, str = orig_str;
e1b29f6a 501 size_t start;
ed35de72 502
e1b29f6a 503 get_str_buf_start (&str, &buf, &start);
ed35de72
MV
504 if (IS_RO_STRING (str))
505 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
506
9de87eea 507 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942
MV
508 if (STRINGBUF_SHARED (buf))
509 {
9c44cd45 510 /* Clone the stringbuf. */
3ee86942
MV
511 size_t len = STRING_LENGTH (str);
512 SCM new_buf;
513
9de87eea 514 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 515
9c44cd45
MG
516 if (scm_i_is_narrow_string (str))
517 {
518 new_buf = make_stringbuf (len);
519 memcpy (STRINGBUF_CHARS (new_buf),
520 STRINGBUF_CHARS (buf) + STRING_START (str), len);
521
522 }
523 else
524 {
525 new_buf = make_wide_stringbuf (len);
526 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
527 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
528 + STRING_START (str)), len);
529 }
3ee86942
MV
530 scm_i_thread_put_to_sleep ();
531 SET_STRING_STRINGBUF (str, new_buf);
532 start -= STRING_START (str);
533 SET_STRING_START (str, 0);
534 scm_i_thread_wake_up ();
535
536 buf = new_buf;
537
9de87eea 538 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 539 }
9c44cd45
MG
540 return orig_str;
541}
542
50b1996f 543/* Return a pointer to the 8-bit Latin-1 chars of a string. */
9c44cd45
MG
544char *
545scm_i_string_writable_chars (SCM str)
546{
547 SCM buf;
548 size_t start;
3ee86942 549
9c44cd45
MG
550 get_str_buf_start (&str, &buf, &start);
551 if (scm_i_is_narrow_string (str))
552 return STRINGBUF_CHARS (buf) + start;
553 else
554 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
555 scm_list_1 (str));
556 return NULL;
557}
558
50b1996f 559/* Return a pointer to the UCS-4 codepoints of a string. */
9c44cd45
MG
560static scm_t_wchar *
561scm_i_string_writable_wide_chars (SCM str)
562{
563 SCM buf;
564 size_t start;
565
566 get_str_buf_start (&str, &buf, &start);
567 if (!scm_i_is_narrow_string (str))
568 return STRINGBUF_WIDE_CHARS (buf) + start;
569 else
570 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
571 scm_list_1 (str));
b00418df
DH
572}
573
50b1996f
MG
574/* Unlock the string mutex that was locked when
575 scm_i_string_start_writing was called. */
3ee86942
MV
576void
577scm_i_string_stop_writing (void)
578{
9de87eea 579 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 580}
b00418df 581
50b1996f 582/* Return the Xth character of STR as a UCS-4 codepoint. */
9c44cd45
MG
583scm_t_wchar
584scm_i_string_ref (SCM str, size_t x)
585{
586 if (scm_i_is_narrow_string (str))
587 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
588 else
589 return scm_i_string_wide_chars (str)[x];
590}
591
50b1996f 592/* Set the Pth character of STR to UCS-4 codepoint CHR. */
9c44cd45
MG
593void
594scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
595{
596 if (chr > 0xFF && scm_i_is_narrow_string (str))
597 widen_stringbuf (STRING_STRINGBUF (str));
598
599 if (scm_i_is_narrow_string (str))
600 {
601 char *dst = scm_i_string_writable_chars (str);
602 dst[p] = (char) (unsigned char) chr;
603 }
604 else
605 {
606 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
607 dst[p] = chr;
608 }
609}
610
3ee86942
MV
611/* Symbols.
612
613 Basic symbol creation and accessing is done here, the rest is in
614 symbols.[hc]. This has been done to keep stringbufs and the
615 internals of strings and string-like objects confined to this file.
616*/
617
618#define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
619
620SCM
6869328b
MV
621scm_i_make_symbol (SCM name, scm_t_bits flags,
622 unsigned long hash, SCM props)
3ee86942
MV
623{
624 SCM buf;
625 size_t start = STRING_START (name);
626 size_t length = STRING_LENGTH (name);
627
628 if (IS_SH_STRING (name))
629 {
630 name = SH_STRING_STRING (name);
631 start += STRING_START (name);
632 }
633 buf = SYMBOL_STRINGBUF (name);
634
635 if (start == 0 && length == STRINGBUF_LENGTH (buf))
636 {
637 /* reuse buf. */
9de87eea 638 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 639 SET_STRINGBUF_SHARED (buf);
9de87eea 640 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942
MV
641 }
642 else
643 {
644 /* make new buf. */
9c44cd45
MG
645 if (scm_i_is_narrow_string (name))
646 {
647 SCM new_buf = make_stringbuf (length);
648 memcpy (STRINGBUF_CHARS (new_buf),
649 STRINGBUF_CHARS (buf) + start, length);
650 buf = new_buf;
651 }
652 else
653 {
654 SCM new_buf = make_wide_stringbuf (length);
655 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
656 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
657 length);
658 buf = new_buf;
659 }
3ee86942 660 }
6869328b 661 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
3ee86942
MV
662 (scm_t_bits) hash, SCM_UNPACK (props));
663}
664
fd0a5bbc
HWN
665SCM
666scm_i_c_make_symbol (const char *name, size_t len,
667 scm_t_bits flags, unsigned long hash, SCM props)
668{
669 SCM buf = make_stringbuf (len);
670 memcpy (STRINGBUF_CHARS (buf), name, len);
671
672 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
673 (scm_t_bits) hash, SCM_UNPACK (props));
674}
675
676/* Return a new symbol that uses the LEN bytes pointed to by NAME as its
677 underlying storage. */
678SCM
679scm_i_c_take_symbol (char *name, size_t len,
680 scm_t_bits flags, unsigned long hash, SCM props)
681{
682 SCM buf = scm_i_take_stringbufn (name, len);
683
684 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
685 (scm_t_bits) hash, SCM_UNPACK (props));
686}
687
50b1996f
MG
688/* Returns the number of characters in SYM. This may be different
689 from the memory size of SYM. */
3ee86942
MV
690size_t
691scm_i_symbol_length (SCM sym)
0f2d19dd 692{
3ee86942 693 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
0f2d19dd
JB
694}
695
071bb6a8
LC
696size_t
697scm_c_symbol_length (SCM sym)
698#define FUNC_NAME "scm_c_symbol_length"
699{
700 SCM_VALIDATE_SYMBOL (1, sym);
701
702 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
703}
704#undef FUNC_NAME
705
50b1996f
MG
706/* True if the name of SYM is stored as a Latin-1 encoded string.
707 False if it is stored as a 32-bit UCS-4-encoded string. */
9c44cd45
MG
708int
709scm_i_is_narrow_symbol (SCM sym)
710{
711 SCM buf;
712
713 buf = SYMBOL_STRINGBUF (sym);
714 return !STRINGBUF_WIDE (buf);
715}
716
50b1996f
MG
717/* Returns a pointer to the 8-bit Latin-1 encoded character array that
718 contains the name of SYM. */
3ee86942
MV
719const char *
720scm_i_symbol_chars (SCM sym)
721{
9c44cd45
MG
722 SCM buf;
723
724 buf = SYMBOL_STRINGBUF (sym);
725 if (!STRINGBUF_WIDE (buf))
726 return STRINGBUF_CHARS (buf);
727 else
728 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
729 scm_list_1 (sym));
730}
731
50b1996f
MG
732/* Return a pointer to the 32-bit UCS-4-encoded character array of a
733 symbol's name. */
9c44cd45
MG
734const scm_t_wchar *
735scm_i_symbol_wide_chars (SCM sym)
736{
737 SCM buf;
738
739 buf = SYMBOL_STRINGBUF (sym);
740 if (STRINGBUF_WIDE (buf))
741 return STRINGBUF_WIDE_CHARS (buf);
742 else
743 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
744 scm_list_1 (sym));
3ee86942 745}
1cc91f1b 746
3ee86942
MV
747SCM
748scm_i_symbol_mark (SCM sym)
0f2d19dd 749{
3ee86942
MV
750 scm_gc_mark (SYMBOL_STRINGBUF (sym));
751 return SCM_CELL_OBJECT_3 (sym);
0f2d19dd
JB
752}
753
3ee86942
MV
754void
755scm_i_symbol_free (SCM sym)
756{
757}
0f2d19dd 758
be54b15d 759SCM
3ee86942 760scm_i_symbol_substring (SCM sym, size_t start, size_t end)
be54b15d 761{
3ee86942 762 SCM buf = SYMBOL_STRINGBUF (sym);
9de87eea 763 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 764 SET_STRINGBUF_SHARED (buf);
9de87eea 765 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
fd2b17b9 766 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
3ee86942
MV
767 (scm_t_bits)start, (scm_t_bits) end - start);
768}
be54b15d 769
50b1996f 770/* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
9c44cd45
MG
771scm_t_wchar
772scm_i_symbol_ref (SCM sym, size_t x)
773{
774 if (scm_i_is_narrow_symbol (sym))
775 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
776 else
777 return scm_i_symbol_wide_chars (sym)[x];
778}
779
3ee86942
MV
780/* Debugging
781 */
be54b15d 782
6ce6923b
MG
783SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
784 "Returns an association list containing debugging information\n"
785 "for @var{str}. The association list has the following entries."
786 "@table @code\n"
787 "@item string\n"
788 "The string itself.\n"
789 "@item start\n"
790 "The start index of the string into its stringbuf\n"
791 "@item length\n"
792 "The length of the string\n"
793 "@item shared\n"
794 "If this string is a substring, it returns its parent string.\n"
795 "Otherwise, it returns @code{#f}\n"
88ed5759
MG
796 "@item read-only\n"
797 "@code{#t} if the string is read-only\n"
6ce6923b
MG
798 "@item stringbuf-chars\n"
799 "A new string containing this string's stringbuf's characters\n"
800 "@item stringbuf-length\n"
801 "The number of characters in this stringbuf\n"
802 "@item stringbuf-shared\n"
803 "@code{#t} if this stringbuf is shared\n"
804 "@item stringbuf-inline\n"
805 "@code{#t} if this stringbuf's characters are stored in the\n"
806 "cell itself, or @code{#f} if they were allocated in memory\n"
807 "@item stringbuf-wide\n"
808 "@code{#t} if this stringbuf's characters are stored in a\n"
809 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
810 "buffer\n"
811 "@end table")
3ee86942
MV
812#define FUNC_NAME s_scm_sys_string_dump
813{
6ce6923b
MG
814 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9, e10;
815 SCM buf;
3ee86942 816 SCM_VALIDATE_STRING (1, str);
6ce6923b
MG
817
818 /* String info */
819 e1 = scm_cons (scm_from_locale_symbol ("string"),
820 str);
821 e2 = scm_cons (scm_from_locale_symbol ("start"),
822 scm_from_size_t (STRING_START (str)));
823 e3 = scm_cons (scm_from_locale_symbol ("length"),
824 scm_from_size_t (STRING_LENGTH (str)));
825
3ee86942
MV
826 if (IS_SH_STRING (str))
827 {
6ce6923b
MG
828 e4 = scm_cons (scm_from_locale_symbol ("shared"),
829 SH_STRING_STRING (str));
830 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
3ee86942
MV
831 }
832 else
833 {
6ce6923b
MG
834 e4 = scm_cons (scm_from_locale_symbol ("shared"),
835 SCM_BOOL_F);
836 buf = STRING_STRINGBUF (str);
837 }
9c44cd45 838
88ed5759
MG
839 if (IS_RO_STRING (str))
840 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
841 SCM_BOOL_T);
842 else
843 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
844 SCM_BOOL_F);
845
6ce6923b 846 /* Stringbuf info */
6ce6923b
MG
847 if (!STRINGBUF_WIDE (buf))
848 {
849 size_t len = STRINGBUF_LENGTH (buf);
850 char *cbuf;
851 SCM sbc = scm_i_make_string (len, &cbuf);
852 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
853 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
854 sbc);
3ee86942 855 }
6ce6923b
MG
856 else
857 {
858 size_t len = STRINGBUF_LENGTH (buf);
859 scm_t_wchar *cbuf;
860 SCM sbc = scm_i_make_wide_string (len, &cbuf);
861 u32_cpy ((scm_t_uint32 *) cbuf,
862 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
863 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
864 sbc);
865 }
866 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
867 scm_from_size_t (STRINGBUF_LENGTH (buf)));
868 if (STRINGBUF_SHARED (buf))
869 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
870 SCM_BOOL_T);
871 else
872 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
873 SCM_BOOL_F);
874 if (STRINGBUF_INLINE (buf))
875 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
876 SCM_BOOL_T);
877 else
878 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
879 SCM_BOOL_F);
880 if (STRINGBUF_WIDE (buf))
881 e10 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
882 SCM_BOOL_T);
883 else
884 e10 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
885 SCM_BOOL_F);
886
887 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, SCM_UNDEFINED);
3ee86942
MV
888}
889#undef FUNC_NAME
890
6ce6923b
MG
891SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
892 "Returns an association list containing debugging information\n"
893 "for @var{sym}. The association list has the following entries."
894 "@table @code\n"
895 "@item symbol\n"
896 "The symbol itself\n"
897 "@item hash\n"
898 "Its hash value\n"
88ed5759
MG
899 "@item interned\n"
900 "@code{#t} if it is an interned symbol\n"
6ce6923b
MG
901 "@item stringbuf-chars\n"
902 "A new string containing this symbols's stringbuf's characters\n"
903 "@item stringbuf-length\n"
904 "The number of characters in this stringbuf\n"
905 "@item stringbuf-shared\n"
906 "@code{#t} if this stringbuf is shared\n"
907 "@item stringbuf-inline\n"
908 "@code{#t} if this stringbuf's characters are stored in the\n"
909 "cell itself, or @code{#f} if they were allocated in memory\n"
910 "@item stringbuf-wide\n"
911 "@code{#t} if this stringbuf's characters are stored in a\n"
912 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
913 "buffer\n"
914 "@end table")
3ee86942
MV
915#define FUNC_NAME s_scm_sys_symbol_dump
916{
6ce6923b
MG
917 SCM e1, e2, e3, e4, e5, e6, e7, e8;
918 SCM buf;
3ee86942 919 SCM_VALIDATE_SYMBOL (1, sym);
6ce6923b
MG
920 e1 = scm_cons (scm_from_locale_symbol ("symbol"),
921 sym);
922 e2 = scm_cons (scm_from_locale_symbol ("hash"),
923 scm_from_ulong (scm_i_symbol_hash (sym)));
88ed5759
MG
924 e3 = scm_cons (scm_from_locale_symbol ("interned"),
925 scm_symbol_interned_p (sym));
6ce6923b
MG
926 buf = SYMBOL_STRINGBUF (sym);
927
928 /* Stringbuf info */
6ce6923b
MG
929 if (!STRINGBUF_WIDE (buf))
930 {
931 size_t len = STRINGBUF_LENGTH (buf);
932 char *cbuf;
933 SCM sbc = scm_i_make_string (len, &cbuf);
934 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
935 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
936 sbc);
937 }
9c44cd45 938 else
6ce6923b
MG
939 {
940 size_t len = STRINGBUF_LENGTH (buf);
941 scm_t_wchar *cbuf;
942 SCM sbc = scm_i_make_wide_string (len, &cbuf);
943 u32_cpy ((scm_t_uint32 *) cbuf,
944 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
945 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
946 sbc);
947 }
948 e5 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
949 scm_from_size_t (STRINGBUF_LENGTH (buf)));
950 if (STRINGBUF_SHARED (buf))
951 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
952 SCM_BOOL_T);
953 else
954 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
955 SCM_BOOL_F);
956 if (STRINGBUF_INLINE (buf))
957 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
958 SCM_BOOL_T);
959 else
960 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
961 SCM_BOOL_F);
962 if (STRINGBUF_WIDE (buf))
963 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
964 SCM_BOOL_T);
965 else
966 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
967 SCM_BOOL_F);
968 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, SCM_UNDEFINED);
969
3ee86942
MV
970}
971#undef FUNC_NAME
972
6ce6923b
MG
973#if SCM_STRING_LENGTH_HISTOGRAM
974
9c44cd45 975SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
e1b29f6a 976#define FUNC_NAME s_scm_sys_stringbuf_hist
3ee86942
MV
977{
978 int i;
979 for (i = 0; i < 1000; i++)
980 if (lenhist[i])
981 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
982 fprintf (stderr, ">999: %u\n", lenhist[1000]);
983 return SCM_UNSPECIFIED;
be54b15d
DH
984}
985#undef FUNC_NAME
986
3ee86942
MV
987#endif
988
989\f
990
991SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
992 (SCM obj),
993 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
994#define FUNC_NAME s_scm_string_p
995{
996 return scm_from_bool (IS_STRING (obj));
997}
998#undef FUNC_NAME
999
1000
1001SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1002
1003SCM_DEFINE (scm_string, "string", 0, 0, 1,
1004 (SCM chrs),
1005 "@deffnx {Scheme Procedure} list->string chrs\n"
1006 "Return a newly allocated string composed of the arguments,\n"
1007 "@var{chrs}.")
1008#define FUNC_NAME s_scm_string
1009{
1010 SCM result;
9c44cd45 1011 SCM rest;
3ee86942 1012 size_t len;
9c44cd45
MG
1013 size_t p = 0;
1014 long i;
3ee86942 1015
9c44cd45
MG
1016 /* Verify that this is a list of chars. */
1017 i = scm_ilength (chrs);
3c7cf7f5
MG
1018 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1019
9c44cd45
MG
1020 len = (size_t) i;
1021 rest = chrs;
3ee86942 1022
9c44cd45 1023 while (len > 0 && scm_is_pair (rest))
3ee86942 1024 {
9c44cd45 1025 SCM elt = SCM_CAR (rest);
3ee86942 1026 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
9c44cd45
MG
1027 rest = SCM_CDR (rest);
1028 len--;
1029 scm_remember_upto_here_1 (elt);
1030 }
1031
1032 /* Construct a string containing this list of chars. */
1033 len = (size_t) i;
1034 rest = chrs;
1035
1036 result = scm_i_make_string (len, NULL);
1037 result = scm_i_string_start_writing (result);
1038 while (len > 0 && scm_is_pair (rest))
1039 {
1040 SCM elt = SCM_CAR (rest);
1041 scm_i_string_set_x (result, p, SCM_CHAR (elt));
1042 p++;
1043 rest = SCM_CDR (rest);
3ee86942 1044 len--;
9c44cd45 1045 scm_remember_upto_here_1 (elt);
3ee86942 1046 }
9c44cd45
MG
1047 scm_i_string_stop_writing ();
1048
3ee86942
MV
1049 if (len > 0)
1050 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
9c44cd45 1051 if (!scm_is_null (rest))
3ee86942
MV
1052 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1053
1054 return result;
1055}
1056#undef FUNC_NAME
be54b15d 1057
3b3b36dd 1058SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
6fa73e72 1059 (SCM k, SCM chr),
0d26a824
MG
1060 "Return a newly allocated string of\n"
1061 "length @var{k}. If @var{chr} is given, then all elements of\n"
1062 "the string are initialized to @var{chr}, otherwise the contents\n"
9401323e 1063 "of the @var{string} are unspecified.")
1bbd0b84 1064#define FUNC_NAME s_scm_make_string
0f2d19dd 1065{
3ee86942
MV
1066 return scm_c_make_string (scm_to_size_t (k), chr);
1067}
1068#undef FUNC_NAME
1069
1070SCM
1071scm_c_make_string (size_t len, SCM chr)
1072#define FUNC_NAME NULL
1073{
9c44cd45
MG
1074 size_t p;
1075 SCM res = scm_i_make_string (len, NULL);
cb0d8be2 1076
e11e83f3
MV
1077 if (!SCM_UNBNDP (chr))
1078 {
3ee86942 1079 SCM_VALIDATE_CHAR (0, chr);
9c44cd45
MG
1080 res = scm_i_string_start_writing (res);
1081 for (p = 0; p < len; p++)
1082 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1083 scm_i_string_stop_writing ();
0f2d19dd 1084 }
e11e83f3
MV
1085
1086 return res;
0f2d19dd 1087}
1bbd0b84 1088#undef FUNC_NAME
0f2d19dd 1089
3b3b36dd 1090SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
0d26a824
MG
1091 (SCM string),
1092 "Return the number of characters in @var{string}.")
1bbd0b84 1093#define FUNC_NAME s_scm_string_length
0f2d19dd 1094{
d1ca2c64 1095 SCM_VALIDATE_STRING (1, string);
3ee86942 1096 return scm_from_size_t (STRING_LENGTH (string));
0f2d19dd 1097}
1bbd0b84 1098#undef FUNC_NAME
0f2d19dd 1099
9c44cd45
MG
1100SCM_DEFINE (scm_string_width, "string-width", 1, 0, 0,
1101 (SCM string),
1102 "Return the bytes used to represent a character in @var{string}."
1103 "This will return 1 or 4.")
1104#define FUNC_NAME s_scm_string_width
1105{
1106 SCM_VALIDATE_STRING (1, string);
1107 if (!scm_i_is_narrow_string (string))
1108 return scm_from_int (4);
1109
1110 return scm_from_int (1);
1111}
1112#undef FUNC_NAME
1113
3ee86942
MV
1114size_t
1115scm_c_string_length (SCM string)
1116{
1117 if (!IS_STRING (string))
1118 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1119 return STRING_LENGTH (string);
1120}
1121
bd9e24b3 1122SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
6fa73e72 1123 (SCM str, SCM k),
9c44cd45
MG
1124 "Return character @var{k} of @var{str} using zero-origin\n"
1125 "indexing. @var{k} must be a valid index of @var{str}.")
1bbd0b84 1126#define FUNC_NAME s_scm_string_ref
0f2d19dd 1127{
3ae3166b 1128 size_t len;
a55c2b68 1129 unsigned long idx;
bd9e24b3 1130
d1ca2c64 1131 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1132
1133 len = scm_i_string_length (str);
1134 if (SCM_LIKELY (len > 0))
1135 idx = scm_to_unsigned_integer (k, 0, len - 1);
1136 else
1137 scm_out_of_range (NULL, k);
1138
9c44cd45
MG
1139 if (scm_i_is_narrow_string (str))
1140 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1141 else
1142 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
0f2d19dd 1143}
1bbd0b84 1144#undef FUNC_NAME
0f2d19dd 1145
3ee86942
MV
1146SCM
1147scm_c_string_ref (SCM str, size_t p)
1148{
1149 if (p >= scm_i_string_length (str))
1150 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1151 if (scm_i_is_narrow_string (str))
1152 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1153 else
1154 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1155
3ee86942 1156}
f0942910 1157
3b3b36dd 1158SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
6fa73e72 1159 (SCM str, SCM k, SCM chr),
9c44cd45
MG
1160 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1161 "an unspecified value. @var{k} must be a valid index of\n"
1162 "@var{str}.")
1bbd0b84 1163#define FUNC_NAME s_scm_string_set_x
0f2d19dd 1164{
3ae3166b 1165 size_t len;
a55c2b68
MV
1166 unsigned long idx;
1167
f0942910 1168 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1169
1170 len = scm_i_string_length (str);
1171 if (SCM_LIKELY (len > 0))
1172 idx = scm_to_unsigned_integer (k, 0, len - 1);
1173 else
1174 scm_out_of_range (NULL, k);
1175
34d19ef6 1176 SCM_VALIDATE_CHAR (3, chr);
9c44cd45
MG
1177 str = scm_i_string_start_writing (str);
1178 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1179 scm_i_string_stop_writing ();
1180
0f2d19dd
JB
1181 return SCM_UNSPECIFIED;
1182}
1bbd0b84 1183#undef FUNC_NAME
0f2d19dd 1184
3ee86942
MV
1185void
1186scm_c_string_set_x (SCM str, size_t p, SCM chr)
1187{
1188 if (p >= scm_i_string_length (str))
1189 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1190 str = scm_i_string_start_writing (str);
1191 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1192 scm_i_string_stop_writing ();
3ee86942 1193}
0f2d19dd 1194
3b3b36dd 1195SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
0d26a824
MG
1196 (SCM str, SCM start, SCM end),
1197 "Return a newly allocated string formed from the characters\n"
1198 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1199 "ending with index @var{end} (exclusive).\n"
1200 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1201 "exact integers satisfying:\n\n"
1202 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1bbd0b84 1203#define FUNC_NAME s_scm_substring
0f2d19dd 1204{
3ee86942 1205 size_t len, from, to;
685c0d71 1206
d1ca2c64 1207 SCM_VALIDATE_STRING (1, str);
3ee86942
MV
1208 len = scm_i_string_length (str);
1209 from = scm_to_unsigned_integer (start, 0, len);
a55c2b68 1210 if (SCM_UNBNDP (end))
3ee86942 1211 to = len;
a55c2b68 1212 else
3ee86942
MV
1213 to = scm_to_unsigned_integer (end, from, len);
1214 return scm_i_substring (str, from, to);
0f2d19dd 1215}
1bbd0b84 1216#undef FUNC_NAME
0f2d19dd 1217
ed35de72
MV
1218SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1219 (SCM str, SCM start, SCM end),
1220 "Return a newly allocated string formed from the characters\n"
1221 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1222 "ending with index @var{end} (exclusive).\n"
1223 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1224 "exact integers satisfying:\n"
1225 "\n"
1226 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1227 "\n"
1228 "The returned string is read-only.\n")
1229#define FUNC_NAME s_scm_substring_read_only
1230{
1231 size_t len, from, to;
1232
1233 SCM_VALIDATE_STRING (1, str);
1234 len = scm_i_string_length (str);
1235 from = scm_to_unsigned_integer (start, 0, len);
1236 if (SCM_UNBNDP (end))
1237 to = len;
1238 else
1239 to = scm_to_unsigned_integer (end, from, len);
1240 return scm_i_substring_read_only (str, from, to);
1241}
1242#undef FUNC_NAME
1243
3ee86942
MV
1244SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1245 (SCM str, SCM start, SCM end),
1246 "Return a newly allocated string formed from the characters\n"
1247 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1248 "ending with index @var{end} (exclusive).\n"
1249 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1250 "exact integers satisfying:\n\n"
1251 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1252#define FUNC_NAME s_scm_substring_copy
1253{
e1b29f6a
MV
1254 /* For the Scheme version, START is mandatory, but for the C
1255 version, it is optional. See scm_string_copy in srfi-13.c for a
1256 rationale.
1257 */
1258
1259 size_t from, to;
3ee86942
MV
1260
1261 SCM_VALIDATE_STRING (1, str);
e1b29f6a
MV
1262 scm_i_get_substring_spec (scm_i_string_length (str),
1263 start, &from, end, &to);
3ee86942
MV
1264 return scm_i_substring_copy (str, from, to);
1265}
1266#undef FUNC_NAME
1267
1268SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1269 (SCM str, SCM start, SCM end),
1270 "Return string that indirectly refers to the characters\n"
1271 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1272 "ending with index @var{end} (exclusive).\n"
1273 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1274 "exact integers satisfying:\n\n"
1275 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1276#define FUNC_NAME s_scm_substring_shared
1277{
1278 size_t len, from, to;
1279
1280 SCM_VALIDATE_STRING (1, str);
1281 len = scm_i_string_length (str);
1282 from = scm_to_unsigned_integer (start, 0, len);
1283 if (SCM_UNBNDP (end))
1284 to = len;
1285 else
1286 to = scm_to_unsigned_integer (end, from, len);
1287 return scm_i_substring_shared (str, from, to);
1288}
1289#undef FUNC_NAME
685c0d71 1290
3b3b36dd 1291SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
6fa73e72 1292 (SCM args),
9c44cd45 1293 "Return a newly allocated string whose characters form the\n"
0d26a824 1294 "concatenation of the given strings, @var{args}.")
1bbd0b84 1295#define FUNC_NAME s_scm_string_append
0f2d19dd
JB
1296{
1297 SCM res;
9c44cd45
MG
1298 size_t len = 0;
1299 int wide = 0;
c829a427 1300 SCM l, s;
bd4911ef 1301 size_t i;
9909c395
MG
1302 union
1303 {
1304 char *narrow;
1305 scm_t_wchar *wide;
1306 } data;
af45e3b0
DH
1307
1308 SCM_VALIDATE_REST_ARGUMENT (args);
9c44cd45 1309 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427
MV
1310 {
1311 s = SCM_CAR (l);
1312 SCM_VALIDATE_STRING (SCM_ARGn, s);
9c44cd45
MG
1313 len += scm_i_string_length (s);
1314 if (!scm_i_is_narrow_string (s))
1315 wide = 1;
c829a427 1316 }
9909c395 1317 data.narrow = NULL;
9c44cd45 1318 if (!wide)
9909c395 1319 res = scm_i_make_string (len, &data.narrow);
9c44cd45 1320 else
9909c395 1321 res = scm_i_make_wide_string (len, &data.wide);
9c44cd45
MG
1322
1323 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427 1324 {
edea856c 1325 size_t len;
c829a427 1326 s = SCM_CAR (l);
3ee86942 1327 SCM_VALIDATE_STRING (SCM_ARGn, s);
edea856c 1328 len = scm_i_string_length (s);
9c44cd45
MG
1329 if (!wide)
1330 {
9909c395
MG
1331 memcpy (data.narrow, scm_i_string_chars (s), len);
1332 data.narrow += len;
9c44cd45
MG
1333 }
1334 else
1335 {
1336 if (scm_i_is_narrow_string (s))
1337 {
1338 for (i = 0; i < scm_i_string_length (s); i++)
9909c395 1339 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
9c44cd45
MG
1340 }
1341 else
9909c395 1342 u32_cpy ((scm_t_uint32 *) data.wide,
9c44cd45 1343 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
9909c395 1344 data.wide += len;
9c44cd45 1345 }
c829a427
MV
1346 scm_remember_upto_here_1 (s);
1347 }
0f2d19dd
JB
1348 return res;
1349}
1bbd0b84 1350#undef FUNC_NAME
0f2d19dd 1351
c829a427
MV
1352int
1353scm_is_string (SCM obj)
1354{
3ee86942 1355 return IS_STRING (obj);
c829a427 1356}
24933780 1357
c829a427
MV
1358SCM
1359scm_from_locale_stringn (const char *str, size_t len)
1360{
1361 SCM res;
1362 char *dst;
4d4528e7 1363
9c44cd45 1364 if (len == (size_t) -1)
c829a427 1365 len = strlen (str);
9c44cd45
MG
1366 if (len == 0)
1367 return scm_nullstr;
1368
3ee86942 1369 res = scm_i_make_string (len, &dst);
c829a427
MV
1370 memcpy (dst, str, len);
1371 return res;
1372}
4d4528e7 1373
c829a427
MV
1374SCM
1375scm_from_locale_string (const char *str)
4d4528e7 1376{
9c44cd45
MG
1377 if (str == NULL)
1378 return scm_nullstr;
1379
c829a427
MV
1380 return scm_from_locale_stringn (str, -1);
1381}
4d4528e7 1382
50b1996f
MG
1383/* Create a new scheme string from the C string STR. The memory of
1384 STR may be used directly as storage for the new string. */
c829a427
MV
1385SCM
1386scm_take_locale_stringn (char *str, size_t len)
1387{
48ddf0d9
KR
1388 SCM buf, res;
1389
9c44cd45 1390 if (len == (size_t) -1)
48ddf0d9 1391 len = strlen (str);
c829a427
MV
1392 else
1393 {
48ddf0d9
KR
1394 /* Ensure STR is null terminated. A realloc for 1 extra byte should
1395 often be satisfied from the alignment padding after the block, with
1396 no actual data movement. */
9c44cd45 1397 str = scm_realloc (str, len + 1);
48ddf0d9 1398 str[len] = '\0';
c829a427 1399 }
c829a427 1400
fd0a5bbc 1401 buf = scm_i_take_stringbufn (str, len);
3ee86942 1402 res = scm_double_cell (STRING_TAG,
9c44cd45 1403 SCM_UNPACK (buf), (scm_t_bits) 0, (scm_t_bits) len);
c829a427
MV
1404 return res;
1405}
1406
48ddf0d9
KR
1407SCM
1408scm_take_locale_string (char *str)
1409{
1410 return scm_take_locale_stringn (str, -1);
1411}
1412
9c44cd45
MG
1413/* Change libunistring escapes (\uXXXX and \UXXXXXXXX) to \xXX \uXXXX
1414 and \UXXXXXX. */
1415static void
1416unistring_escapes_to_guile_escapes (char **bufp, size_t *lenp)
1417{
1418 char *before, *after;
1419 size_t i, j;
1420
1421 before = *bufp;
1422 after = *bufp;
1423 i = 0;
1424 j = 0;
1425 while (i < *lenp)
1426 {
1427 if ((i <= *lenp - 6)
1428 && before[i] == '\\'
1429 && before[i + 1] == 'u'
1430 && before[i + 2] == '0' && before[i + 3] == '0')
1431 {
1432 /* Convert \u00NN to \xNN */
1433 after[j] = '\\';
1434 after[j + 1] = 'x';
30a6b9ca
MG
1435 after[j + 2] = tolower ((int) before[i + 4]);
1436 after[j + 3] = tolower ((int) before[i + 5]);
9c44cd45
MG
1437 i += 6;
1438 j += 4;
1439 }
1440 else if ((i <= *lenp - 10)
1441 && before[i] == '\\'
1442 && before[i + 1] == 'U'
1443 && before[i + 2] == '0' && before[i + 3] == '0')
1444 {
1445 /* Convert \U00NNNNNN to \UNNNNNN */
1446 after[j] = '\\';
1447 after[j + 1] = 'U';
30a6b9ca
MG
1448 after[j + 2] = tolower ((int) before[i + 4]);
1449 after[j + 3] = tolower ((int) before[i + 5]);
1450 after[j + 4] = tolower ((int) before[i + 6]);
1451 after[j + 5] = tolower ((int) before[i + 7]);
1452 after[j + 6] = tolower ((int) before[i + 8]);
1453 after[j + 7] = tolower ((int) before[i + 9]);
9c44cd45
MG
1454 i += 10;
1455 j += 8;
1456 }
1457 else
1458 {
1459 after[j] = before[i];
1460 i++;
1461 j++;
1462 }
1463 }
1464 *lenp = j;
1465 after = scm_realloc (after, j);
1466}
1467
c829a427 1468char *
9c44cd45 1469scm_to_locale_stringn (SCM str, size_t * lenp)
c829a427 1470{
9c44cd45
MG
1471 const char *enc;
1472
1473 /* In the future, enc will hold the port's encoding. */
1474 enc = NULL;
1475
1476 return scm_to_stringn (str, lenp, enc, iconveh_escape_sequence);
1477}
1478
1479/* Low-level scheme to C string conversion function. */
1480char *
1481scm_to_stringn (SCM str, size_t * lenp, const char *encoding,
1482 enum iconv_ilseq_handler handler)
1483{
1484 static const char iso[11] = "ISO-8859-1";
1485 char *buf;
1486 size_t ilen, len, i;
4d4528e7 1487
3ee86942 1488 if (!scm_is_string (str))
c829a427 1489 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1490 ilen = scm_i_string_length (str);
1491
1492 if (ilen == 0)
1493 {
1494 buf = scm_malloc (1);
1495 buf[0] = '\0';
1496 if (lenp)
1497 *lenp = 0;
1498 return buf;
1499 }
1500
c829a427 1501 if (lenp == NULL)
9c44cd45
MG
1502 for (i = 0; i < ilen; i++)
1503 if (scm_i_string_ref (str, i) == '\0')
1504 scm_misc_error (NULL,
1505 "string contains #\\nul character: ~S",
1506 scm_list_1 (str));
1507
1508 if (scm_i_is_narrow_string (str))
c829a427 1509 {
9c44cd45
MG
1510 if (lenp)
1511 {
1512 buf = scm_malloc (ilen);
1513 memcpy (buf, scm_i_string_chars (str), ilen);
1514 *lenp = ilen;
1515 return buf;
1516 }
1517 else
1518 {
1519 buf = scm_malloc (ilen + 1);
1520 memcpy (buf, scm_i_string_chars (str), ilen);
1521 buf[ilen] = '\0';
1522 return buf;
1523 }
c829a427 1524 }
9c44cd45
MG
1525
1526
1527 buf = NULL;
1528 len = 0;
1529 buf = u32_conv_to_encoding (iso,
1530 handler,
1531 (scm_t_uint32 *) scm_i_string_wide_chars (str),
1532 ilen, NULL, NULL, &len);
1533 if (buf == NULL)
1534 scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
1535 scm_list_2 (scm_from_locale_string (iso), str));
1536
1537 if (handler == iconveh_escape_sequence)
1538 unistring_escapes_to_guile_escapes (&buf, &len);
1539
1540 if (lenp)
4d4528e7 1541 *lenp = len;
9c44cd45
MG
1542 else
1543 {
1544 buf = scm_realloc (buf, len + 1);
1545 buf[len] = '\0';
1546 }
24933780 1547
c829a427 1548 scm_remember_upto_here_1 (str);
9c44cd45 1549 return buf;
4d4528e7 1550}
af68e5e5 1551
c829a427
MV
1552char *
1553scm_to_locale_string (SCM str)
1554{
1555 return scm_to_locale_stringn (str, NULL);
1556}
af68e5e5 1557
c829a427
MV
1558size_t
1559scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
1560{
1561 size_t len;
9c44cd45 1562 char *result = NULL;
3ee86942 1563 if (!scm_is_string (str))
c829a427 1564 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1565 result = scm_to_locale_stringn (str, &len);
1566
1567 memcpy (buf, result, (len > max_len) ? max_len : len);
1568 free (result);
1569
c829a427
MV
1570 scm_remember_upto_here_1 (str);
1571 return len;
1572}
af68e5e5 1573
3ee86942
MV
1574/* converts C scm_array of strings to SCM scm_list of strings. */
1575/* If argc < 0, a null terminated scm_array is assumed. */
9c44cd45 1576SCM
3ee86942
MV
1577scm_makfromstrs (int argc, char **argv)
1578{
1579 int i = argc;
1580 SCM lst = SCM_EOL;
1581 if (0 > i)
1582 for (i = 0; argv[i]; i++);
1583 while (i--)
1584 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
1585 return lst;
1586}
1587
c829a427
MV
1588/* Return a newly allocated array of char pointers to each of the strings
1589 in args, with a terminating NULL pointer. */
1590
1591char **
1592scm_i_allocate_string_pointers (SCM list)
af68e5e5 1593{
c829a427
MV
1594 char **result;
1595 int len = scm_ilength (list);
1596 int i;
1597
1598 if (len < 0)
1599 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
1600
661ae7ab 1601 scm_dynwind_begin (0);
c829a427
MV
1602
1603 result = (char **) scm_malloc ((len + 1) * sizeof (char *));
1604 result[len] = NULL;
661ae7ab 1605 scm_dynwind_unwind_handler (free, result, 0);
c829a427
MV
1606
1607 /* The list might be have been modified in another thread, so
1608 we check LIST before each access.
1609 */
d2e53ed6 1610 for (i = 0; i < len && scm_is_pair (list); i++)
c829a427
MV
1611 {
1612 result[i] = scm_to_locale_string (SCM_CAR (list));
1613 list = SCM_CDR (list);
1614 }
1615
661ae7ab 1616 scm_dynwind_end ();
c829a427 1617 return result;
af68e5e5 1618}
e53cc817 1619
c829a427
MV
1620void
1621scm_i_free_string_pointers (char **pointers)
1622{
1623 int i;
1624
1625 for (i = 0; pointers[i]; i++)
1626 free (pointers[i]);
1627 free (pointers);
1628}
24933780 1629
6f14f578
MV
1630void
1631scm_i_get_substring_spec (size_t len,
1632 SCM start, size_t *cstart,
1633 SCM end, size_t *cend)
1634{
1635 if (SCM_UNBNDP (start))
1636 *cstart = 0;
1637 else
1638 *cstart = scm_to_unsigned_integer (start, 0, len);
1639
1640 if (SCM_UNBNDP (end))
1641 *cend = len;
1642 else
1643 *cend = scm_to_unsigned_integer (end, *cstart, len);
1644}
1645
3ee86942
MV
1646#if SCM_ENABLE_DEPRECATED
1647
556d75db
MV
1648/* When these definitions are removed, it becomes reasonable to use
1649 read-only strings for string literals. For that, change the reader
1650 to create string literals with scm_c_substring_read_only instead of
1651 with scm_c_substring_copy.
1652*/
1653
3ee86942 1654int
fe78c51a 1655scm_i_deprecated_stringp (SCM str)
3ee86942
MV
1656{
1657 scm_c_issue_deprecation_warning
1658 ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
1659
2616f0e0 1660 return scm_is_string (str);
3ee86942
MV
1661}
1662
1663char *
fe78c51a 1664scm_i_deprecated_string_chars (SCM str)
3ee86942
MV
1665{
1666 char *chars;
1667
1668 scm_c_issue_deprecation_warning
1669 ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
1670
2616f0e0
MV
1671 /* We don't accept shared substrings here since they are not
1672 null-terminated.
1673 */
1674 if (IS_SH_STRING (str))
1675 scm_misc_error (NULL,
1676 "SCM_STRING_CHARS does not work with shared substrings.",
1677 SCM_EOL);
1678
877f06c3 1679 /* We explicitly test for read-only strings to produce a better
556d75db
MV
1680 error message.
1681 */
1682
1683 if (IS_RO_STRING (str))
1684 scm_misc_error (NULL,
1685 "SCM_STRING_CHARS does not work with read-only strings.",
1686 SCM_EOL);
1687
2616f0e0 1688 /* The following is still wrong, of course...
3ee86942 1689 */
9c44cd45 1690 str = scm_i_string_start_writing (str);
3ee86942
MV
1691 chars = scm_i_string_writable_chars (str);
1692 scm_i_string_stop_writing ();
1693 return chars;
1694}
1695
1696size_t
fe78c51a 1697scm_i_deprecated_string_length (SCM str)
3ee86942
MV
1698{
1699 scm_c_issue_deprecation_warning
1700 ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
1701 return scm_c_string_length (str);
1702}
1703
1704#endif
1705
0f2d19dd
JB
1706void
1707scm_init_strings ()
0f2d19dd 1708{
3ee86942 1709 scm_nullstr = scm_i_make_string (0, NULL);
7c33806a 1710
a0599745 1711#include "libguile/strings.x"
0f2d19dd
JB
1712}
1713
89e00824
ML
1714
1715/*
1716 Local Variables:
1717 c-file-style: "gnu"
1718 End:
1719*/