procedure-documentation works on vm procedures
[bpt/guile.git] / libguile / strings.c
CommitLineData
50b1996f 1/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009 Free Software Foundation, Inc.
0f2d19dd 2 *
73be1d9e 3 * This library is free software; you can redistribute it and/or
53befeb7
NJ
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
0f2d19dd 7 *
53befeb7
NJ
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
0f2d19dd 12 *
73be1d9e
MV
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
53befeb7
NJ
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
73be1d9e 17 */
1bbd0b84 18
1bbd0b84 19
0f2d19dd 20\f
dbb605f5
LC
21#ifdef HAVE_CONFIG_H
22# include <config.h>
23#endif
0f2d19dd 24
faf2c9d7 25#include <string.h>
3ee86942 26#include <stdio.h>
9c44cd45
MG
27#include <ctype.h>
28#include <unistr.h>
eca29b02 29#include <uniconv.h>
faf2c9d7 30
a0599745
MD
31#include "libguile/_scm.h"
32#include "libguile/chars.h"
7c33806a 33#include "libguile/root.h"
a0599745 34#include "libguile/strings.h"
1afff620 35#include "libguile/deprecation.h"
a0599745 36#include "libguile/validate.h"
c829a427 37#include "libguile/dynwind.h"
1afff620 38
0f2d19dd
JB
39\f
40
41/* {Strings}
42 */
43
3ee86942
MV
44
45/* Stringbufs
46 *
47 * XXX - keeping an accurate refcount during GC seems to be quite
48 * tricky, so we just keep score of whether a stringbuf might be
50b1996f 49 * shared, not whether it definitely is.
3ee86942
MV
50 *
51 * The scheme I (mvo) tried to keep an accurate reference count would
52 * recount all strings that point to a stringbuf during the mark-phase
53 * of the GC. This was done since one cannot access the stringbuf of
54 * a string when that string is freed (in order to decrease the
55 * reference count). The memory of the stringbuf might have been
56 * reused already for something completely different.
57 *
58 * This recounted worked for a small number of threads beating on
59 * cow-strings, but it failed randomly with more than 10 threads, say.
60 * I couldn't figure out what went wrong, so I used the conservative
61 * approach implemented below.
62 *
63 * A stringbuf needs to know its length, but only so that it can be
64 * reported when the stringbuf is freed.
65 *
50b1996f
MG
66 * There are 3 storage strategies for stringbufs: inline, outline, and
67 * wide.
68 *
69 * Inline strings are small 8-bit strings stored within the double
70 * cell itself. Outline strings are larger 8-bit strings with GC
71 * allocated storage. Wide strings are 32-bit strings with allocated
72 * storage.
73 *
74 * There was little value in making wide string inlineable, since
75 * there is only room for three inlined 32-bit characters. Thus wide
76 * stringbufs are never inlined.
3ee86942
MV
77 */
78
79#define STRINGBUF_F_SHARED 0x100
80#define STRINGBUF_F_INLINE 0x200
50b1996f
MG
81#define STRINGBUF_F_WIDE 0x400 /* If true, strings have UCS-4
82 encoding. Otherwise, strings
83 are Latin-1. */
3ee86942
MV
84
85#define STRINGBUF_TAG scm_tc7_stringbuf
86#define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
87#define STRINGBUF_INLINE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_INLINE)
9c44cd45 88#define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
3ee86942 89
f59cf998 90#define STRINGBUF_OUTLINE_CHARS(buf) ((unsigned char *) SCM_CELL_WORD_1(buf))
3ee86942 91#define STRINGBUF_OUTLINE_LENGTH(buf) (SCM_CELL_WORD_2(buf))
f59cf998 92#define STRINGBUF_INLINE_CHARS(buf) ((unsigned char *) SCM_CELL_OBJECT_LOC(buf,1))
3ee86942
MV
93#define STRINGBUF_INLINE_LENGTH(buf) (((size_t)SCM_CELL_WORD_0(buf))>>16)
94
95#define STRINGBUF_CHARS(buf) (STRINGBUF_INLINE (buf) \
96 ? STRINGBUF_INLINE_CHARS (buf) \
97 : STRINGBUF_OUTLINE_CHARS (buf))
50b1996f 98
f59cf998 99#define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) SCM_CELL_WORD_1(buf))
3ee86942
MV
100#define STRINGBUF_LENGTH(buf) (STRINGBUF_INLINE (buf) \
101 ? STRINGBUF_INLINE_LENGTH (buf) \
102 : STRINGBUF_OUTLINE_LENGTH (buf))
103
104#define STRINGBUF_MAX_INLINE_LEN (3*sizeof(scm_t_bits))
105
106#define SET_STRINGBUF_SHARED(buf) \
107 (SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED))
108
6ce6923b 109#if SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
110static size_t lenhist[1001];
111#endif
112
50b1996f
MG
113/* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
114 characters. */
3ee86942
MV
115static SCM
116make_stringbuf (size_t len)
0f2d19dd 117{
3ee86942
MV
118 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
119 scm_i_symbol_chars, all stringbufs are null-terminated. Once
120 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
121 has been changed for scm_i_symbol_chars, this null-termination
122 can be dropped.
123 */
124
6ce6923b 125#if SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
126 if (len < 1000)
127 lenhist[len]++;
128 else
129 lenhist[1000]++;
130#endif
0f2d19dd 131
3ee86942
MV
132 if (len <= STRINGBUF_MAX_INLINE_LEN-1)
133 {
134 return scm_double_cell (STRINGBUF_TAG | STRINGBUF_F_INLINE | (len << 16),
135 0, 0, 0);
136 }
137 else
138 {
139 char *mem = scm_gc_malloc (len+1, "string");
140 mem[len] = '\0';
141 return scm_double_cell (STRINGBUF_TAG, (scm_t_bits) mem,
142 (scm_t_bits) len, (scm_t_bits) 0);
143 }
144}
e53cc817 145
50b1996f
MG
146/* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
147 characters. */
9c44cd45
MG
148static SCM
149make_wide_stringbuf (size_t len)
150{
151 scm_t_wchar *mem;
6ce6923b 152#if SCM_STRING_LENGTH_HISTOGRAM
9c44cd45
MG
153 if (len < 1000)
154 lenhist[len]++;
155 else
156 lenhist[1000]++;
157#endif
158
159 mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
160 mem[len] = 0;
161 return scm_double_cell (STRINGBUF_TAG | STRINGBUF_F_WIDE, (scm_t_bits) mem,
162 (scm_t_bits) len, (scm_t_bits) 0);
163}
164
2b829bbb
KR
165/* Return a new stringbuf whose underlying storage consists of the LEN+1
166 octets pointed to by STR (the last octet is zero). */
7f74cf9a 167SCM
fd0a5bbc
HWN
168scm_i_take_stringbufn (char *str, size_t len)
169{
2b829bbb 170 scm_gc_register_collectable_memory (str, len + 1, "stringbuf");
fd0a5bbc
HWN
171
172 return scm_double_cell (STRINGBUF_TAG, (scm_t_bits) str,
173 (scm_t_bits) len, (scm_t_bits) 0);
174}
175
3ee86942
MV
176SCM
177scm_i_stringbuf_mark (SCM buf)
178{
179 return SCM_BOOL_F;
180}
1bbd0b84 181
3ee86942
MV
182void
183scm_i_stringbuf_free (SCM buf)
0f2d19dd 184{
3ee86942 185 if (!STRINGBUF_INLINE (buf))
9c44cd45
MG
186 {
187 if (!STRINGBUF_WIDE (buf))
188 scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf),
189 STRINGBUF_OUTLINE_LENGTH (buf) + 1, "string");
190 else
191 scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf),
192 sizeof (scm_t_wchar) * (STRINGBUF_OUTLINE_LENGTH (buf)
193 + 1), "string");
194 }
195
196}
197
50b1996f
MG
198/* Convert a stringbuf containing 8-bit Latin-1-encoded characters to
199 one containing 32-bit UCS-4-encoded characters. */
9c44cd45
MG
200static void
201widen_stringbuf (SCM buf)
202{
203 size_t i, len;
204 scm_t_wchar *mem;
205
206 if (STRINGBUF_WIDE (buf))
207 return;
208
209 if (STRINGBUF_INLINE (buf))
210 {
211 len = STRINGBUF_INLINE_LENGTH (buf);
212
213 mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
214 for (i = 0; i < len; i++)
215 mem[i] =
f59cf998 216 (scm_t_wchar) STRINGBUF_INLINE_CHARS (buf)[i];
9c44cd45
MG
217 mem[len] = 0;
218
219 SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) ^ STRINGBUF_F_INLINE);
220 SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE);
221 SCM_SET_CELL_WORD_1 (buf, mem);
222 SCM_SET_CELL_WORD_2 (buf, len);
223 }
224 else
225 {
226 len = STRINGBUF_OUTLINE_LENGTH (buf);
227
228 mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
229 for (i = 0; i < len; i++)
230 mem[i] =
f59cf998 231 (scm_t_wchar) STRINGBUF_OUTLINE_CHARS (buf)[i];
9c44cd45
MG
232 mem[len] = 0;
233
234 scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf), len + 1, "string");
235
236 SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE);
237 SCM_SET_CELL_WORD_1 (buf, mem);
238 SCM_SET_CELL_WORD_2 (buf, len);
239 }
3ee86942 240}
bd9e24b3 241
9de87eea 242scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
bd9e24b3 243
3ee86942
MV
244/* Copy-on-write strings.
245 */
bd9e24b3 246
3ee86942 247#define STRING_TAG scm_tc7_string
bd9e24b3 248
3ee86942
MV
249#define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
250#define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
251#define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
bd9e24b3 252
3ee86942
MV
253#define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
254#define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
255
256#define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
257
ed35de72
MV
258/* Read-only strings.
259 */
260
261#define RO_STRING_TAG (scm_tc7_string + 0x200)
262#define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
263
e1b29f6a
MV
264/* Mutation-sharing substrings
265 */
266
267#define SH_STRING_TAG (scm_tc7_string + 0x100)
268
269#define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
270/* START and LENGTH as for STRINGs. */
271
272#define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
273
50b1996f
MG
274/* Create a scheme string with space for LEN 8-bit Latin-1-encoded
275 characters. CHARSP, if not NULL, will be set to location of the
276 char array. */
3ee86942
MV
277SCM
278scm_i_make_string (size_t len, char **charsp)
279{
280 SCM buf = make_stringbuf (len);
281 SCM res;
282 if (charsp)
f59cf998 283 *charsp = (char *) STRINGBUF_CHARS (buf);
3ee86942
MV
284 res = scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
285 (scm_t_bits)0, (scm_t_bits) len);
286 return res;
0f2d19dd
JB
287}
288
50b1996f
MG
289/* Create a scheme string with space for LEN 32-bit UCS-4-encoded
290 characters. CHARSP, if not NULL, will be set to location of the
291 character array. */
9c44cd45 292SCM
50b1996f 293scm_i_make_wide_string (size_t len, scm_t_wchar **charsp)
9c44cd45
MG
294{
295 SCM buf = make_wide_stringbuf (len);
296 SCM res;
297 if (charsp)
298 *charsp = STRINGBUF_WIDE_CHARS (buf);
299 res = scm_double_cell (STRING_TAG, SCM_UNPACK (buf),
300 (scm_t_bits) 0, (scm_t_bits) len);
301 return res;
302}
303
3ee86942
MV
304static void
305validate_substring_args (SCM str, size_t start, size_t end)
306{
307 if (!IS_STRING (str))
308 scm_wrong_type_arg_msg (NULL, 0, str, "string");
309 if (start > STRING_LENGTH (str))
310 scm_out_of_range (NULL, scm_from_size_t (start));
311 if (end > STRING_LENGTH (str) || end < start)
312 scm_out_of_range (NULL, scm_from_size_t (end));
313}
0f2d19dd 314
e1b29f6a
MV
315static inline void
316get_str_buf_start (SCM *str, SCM *buf, size_t *start)
317{
318 *start = STRING_START (*str);
319 if (IS_SH_STRING (*str))
320 {
321 *str = SH_STRING_STRING (*str);
322 *start += STRING_START (*str);
323 }
324 *buf = STRING_STRINGBUF (*str);
325}
326
3ee86942
MV
327SCM
328scm_i_substring (SCM str, size_t start, size_t end)
0f2d19dd 329{
e1b29f6a
MV
330 SCM buf;
331 size_t str_start;
332 get_str_buf_start (&str, &buf, &str_start);
9de87eea 333 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 334 SET_STRINGBUF_SHARED (buf);
9de87eea 335 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 336 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
e1b29f6a
MV
337 (scm_t_bits)str_start + start,
338 (scm_t_bits) end - start);
0f2d19dd
JB
339}
340
ed35de72
MV
341SCM
342scm_i_substring_read_only (SCM str, size_t start, size_t end)
343{
45a9f430
LC
344 SCM buf;
345 size_t str_start;
346 get_str_buf_start (&str, &buf, &str_start);
347 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
348 SET_STRINGBUF_SHARED (buf);
349 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
350 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
351 (scm_t_bits)str_start + start,
352 (scm_t_bits) end - start);
ed35de72
MV
353}
354
3ee86942
MV
355SCM
356scm_i_substring_copy (SCM str, size_t start, size_t end)
357{
358 size_t len = end - start;
edea856c 359 SCM buf, my_buf;
e1b29f6a
MV
360 size_t str_start;
361 get_str_buf_start (&str, &buf, &str_start);
9c44cd45
MG
362 if (scm_i_is_narrow_string (str))
363 {
364 my_buf = make_stringbuf (len);
365 memcpy (STRINGBUF_CHARS (my_buf),
366 STRINGBUF_CHARS (buf) + str_start + start, len);
367 }
368 else
369 {
370 my_buf = make_wide_stringbuf (len);
371 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
372 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
373 + start), len);
374 /* Even though this string is wide, the substring may be narrow.
50b1996f 375 Consider adding code to narrow the string. */
9c44cd45 376 }
3ee86942 377 scm_remember_upto_here_1 (buf);
9c44cd45
MG
378 return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
379 (scm_t_bits) 0, (scm_t_bits) len);
3ee86942 380}
0f2d19dd 381
e1b29f6a
MV
382SCM
383scm_i_substring_shared (SCM str, size_t start, size_t end)
384{
385 if (start == 0 && end == STRING_LENGTH (str))
386 return str;
387 else
388 {
389 size_t len = end - start;
390 if (IS_SH_STRING (str))
391 {
392 start += STRING_START (str);
393 str = SH_STRING_STRING (str);
394 }
395 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
396 (scm_t_bits)start, (scm_t_bits) len);
397 }
398}
399
3ee86942
MV
400SCM
401scm_c_substring (SCM str, size_t start, size_t end)
402{
403 validate_substring_args (str, start, end);
404 return scm_i_substring (str, start, end);
405}
ee149d03 406
ed35de72
MV
407SCM
408scm_c_substring_read_only (SCM str, size_t start, size_t end)
409{
410 validate_substring_args (str, start, end);
411 return scm_i_substring_read_only (str, start, end);
412}
413
0f2d19dd 414SCM
3ee86942 415scm_c_substring_copy (SCM str, size_t start, size_t end)
0f2d19dd 416{
3ee86942
MV
417 validate_substring_args (str, start, end);
418 return scm_i_substring_copy (str, start, end);
419}
420
3ee86942
MV
421SCM
422scm_c_substring_shared (SCM str, size_t start, size_t end)
423{
424 validate_substring_args (str, start, end);
425 return scm_i_substring_shared (str, start, end);
426}
0f2d19dd 427
ee149d03 428SCM
3ee86942 429scm_i_string_mark (SCM str)
ee149d03 430{
3ee86942
MV
431 if (IS_SH_STRING (str))
432 return SH_STRING_STRING (str);
433 else
434 return STRING_STRINGBUF (str);
ee149d03
JB
435}
436
3ee86942
MV
437void
438scm_i_string_free (SCM str)
439{
440}
36284627 441
3ee86942
MV
442/* Internal accessors
443 */
444
50b1996f
MG
445/* Returns the number of characters in STR. This may be different
446 than the memory size of the string storage. */
3ee86942
MV
447size_t
448scm_i_string_length (SCM str)
0f2d19dd 449{
3ee86942 450 return STRING_LENGTH (str);
0f2d19dd
JB
451}
452
50b1996f
MG
453/* True if the string is 'narrow', meaning it has a 8-bit Latin-1
454 encoding. False if it is 'wide', having a 32-bit UCS-4
455 encoding. */
9c44cd45
MG
456int
457scm_i_is_narrow_string (SCM str)
458{
459 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
460}
461
50b1996f
MG
462/* Returns a pointer to the 8-bit Latin-1 encoded character array of
463 STR. */
3ee86942
MV
464const char *
465scm_i_string_chars (SCM str)
466{
467 SCM buf;
e1b29f6a
MV
468 size_t start;
469 get_str_buf_start (&str, &buf, &start);
9c44cd45 470 if (scm_i_is_narrow_string (str))
f59cf998 471 return (const char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
472 else
473 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
474 scm_list_1 (str));
475 return NULL;
3ee86942 476}
b00418df 477
50b1996f
MG
478/* Returns a pointer to the 32-bit UCS-4 encoded character array of
479 STR. */
9c44cd45
MG
480const scm_t_wchar *
481scm_i_string_wide_chars (SCM str)
482{
483 SCM buf;
484 size_t start;
485
486 get_str_buf_start (&str, &buf, &start);
487 if (!scm_i_is_narrow_string (str))
f59cf998 488 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
9c44cd45
MG
489 else
490 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
491 scm_list_1 (str));
492}
493
494/* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
495 a new string buffer, so that it can be modified without modifying
50b1996f
MG
496 other strings. Also, lock the string mutex. Later, one must call
497 scm_i_string_stop_writing to unlock the mutex. */
9c44cd45
MG
498SCM
499scm_i_string_start_writing (SCM orig_str)
b00418df 500{
ed35de72 501 SCM buf, str = orig_str;
e1b29f6a 502 size_t start;
ed35de72 503
e1b29f6a 504 get_str_buf_start (&str, &buf, &start);
ed35de72
MV
505 if (IS_RO_STRING (str))
506 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
507
9de87eea 508 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942
MV
509 if (STRINGBUF_SHARED (buf))
510 {
9c44cd45 511 /* Clone the stringbuf. */
3ee86942
MV
512 size_t len = STRING_LENGTH (str);
513 SCM new_buf;
514
9de87eea 515 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 516
9c44cd45
MG
517 if (scm_i_is_narrow_string (str))
518 {
519 new_buf = make_stringbuf (len);
520 memcpy (STRINGBUF_CHARS (new_buf),
521 STRINGBUF_CHARS (buf) + STRING_START (str), len);
522
523 }
524 else
525 {
526 new_buf = make_wide_stringbuf (len);
527 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
528 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
529 + STRING_START (str)), len);
530 }
3ee86942
MV
531 scm_i_thread_put_to_sleep ();
532 SET_STRING_STRINGBUF (str, new_buf);
533 start -= STRING_START (str);
534 SET_STRING_START (str, 0);
535 scm_i_thread_wake_up ();
536
537 buf = new_buf;
538
9de87eea 539 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 540 }
9c44cd45
MG
541 return orig_str;
542}
543
50b1996f 544/* Return a pointer to the 8-bit Latin-1 chars of a string. */
9c44cd45
MG
545char *
546scm_i_string_writable_chars (SCM str)
547{
548 SCM buf;
549 size_t start;
3ee86942 550
9c44cd45
MG
551 get_str_buf_start (&str, &buf, &start);
552 if (scm_i_is_narrow_string (str))
f59cf998 553 return (char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
554 else
555 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
556 scm_list_1 (str));
557 return NULL;
558}
559
50b1996f 560/* Return a pointer to the UCS-4 codepoints of a string. */
9c44cd45
MG
561static scm_t_wchar *
562scm_i_string_writable_wide_chars (SCM str)
563{
564 SCM buf;
565 size_t start;
566
567 get_str_buf_start (&str, &buf, &start);
568 if (!scm_i_is_narrow_string (str))
569 return STRINGBUF_WIDE_CHARS (buf) + start;
570 else
571 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
572 scm_list_1 (str));
b00418df
DH
573}
574
50b1996f
MG
575/* Unlock the string mutex that was locked when
576 scm_i_string_start_writing was called. */
3ee86942
MV
577void
578scm_i_string_stop_writing (void)
579{
9de87eea 580 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 581}
b00418df 582
50b1996f 583/* Return the Xth character of STR as a UCS-4 codepoint. */
9c44cd45
MG
584scm_t_wchar
585scm_i_string_ref (SCM str, size_t x)
586{
587 if (scm_i_is_narrow_string (str))
588 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
589 else
590 return scm_i_string_wide_chars (str)[x];
591}
592
50b1996f 593/* Set the Pth character of STR to UCS-4 codepoint CHR. */
9c44cd45
MG
594void
595scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
596{
597 if (chr > 0xFF && scm_i_is_narrow_string (str))
598 widen_stringbuf (STRING_STRINGBUF (str));
599
600 if (scm_i_is_narrow_string (str))
601 {
602 char *dst = scm_i_string_writable_chars (str);
603 dst[p] = (char) (unsigned char) chr;
604 }
605 else
606 {
607 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
608 dst[p] = chr;
609 }
610}
611
3ee86942
MV
612/* Symbols.
613
614 Basic symbol creation and accessing is done here, the rest is in
615 symbols.[hc]. This has been done to keep stringbufs and the
616 internals of strings and string-like objects confined to this file.
617*/
618
619#define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
620
621SCM
6869328b
MV
622scm_i_make_symbol (SCM name, scm_t_bits flags,
623 unsigned long hash, SCM props)
3ee86942
MV
624{
625 SCM buf;
626 size_t start = STRING_START (name);
627 size_t length = STRING_LENGTH (name);
628
629 if (IS_SH_STRING (name))
630 {
631 name = SH_STRING_STRING (name);
632 start += STRING_START (name);
633 }
634 buf = SYMBOL_STRINGBUF (name);
635
636 if (start == 0 && length == STRINGBUF_LENGTH (buf))
637 {
638 /* reuse buf. */
9de87eea 639 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 640 SET_STRINGBUF_SHARED (buf);
9de87eea 641 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942
MV
642 }
643 else
644 {
645 /* make new buf. */
9c44cd45
MG
646 if (scm_i_is_narrow_string (name))
647 {
648 SCM new_buf = make_stringbuf (length);
649 memcpy (STRINGBUF_CHARS (new_buf),
650 STRINGBUF_CHARS (buf) + start, length);
651 buf = new_buf;
652 }
653 else
654 {
655 SCM new_buf = make_wide_stringbuf (length);
656 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
657 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
658 length);
659 buf = new_buf;
660 }
3ee86942 661 }
6869328b 662 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
3ee86942
MV
663 (scm_t_bits) hash, SCM_UNPACK (props));
664}
665
fd0a5bbc
HWN
666SCM
667scm_i_c_make_symbol (const char *name, size_t len,
668 scm_t_bits flags, unsigned long hash, SCM props)
669{
670 SCM buf = make_stringbuf (len);
671 memcpy (STRINGBUF_CHARS (buf), name, len);
672
673 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
674 (scm_t_bits) hash, SCM_UNPACK (props));
675}
676
677/* Return a new symbol that uses the LEN bytes pointed to by NAME as its
678 underlying storage. */
679SCM
680scm_i_c_take_symbol (char *name, size_t len,
681 scm_t_bits flags, unsigned long hash, SCM props)
682{
683 SCM buf = scm_i_take_stringbufn (name, len);
684
685 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
686 (scm_t_bits) hash, SCM_UNPACK (props));
687}
688
50b1996f
MG
689/* Returns the number of characters in SYM. This may be different
690 from the memory size of SYM. */
3ee86942
MV
691size_t
692scm_i_symbol_length (SCM sym)
0f2d19dd 693{
3ee86942 694 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
0f2d19dd
JB
695}
696
071bb6a8
LC
697size_t
698scm_c_symbol_length (SCM sym)
699#define FUNC_NAME "scm_c_symbol_length"
700{
701 SCM_VALIDATE_SYMBOL (1, sym);
702
703 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
704}
705#undef FUNC_NAME
706
50b1996f
MG
707/* True if the name of SYM is stored as a Latin-1 encoded string.
708 False if it is stored as a 32-bit UCS-4-encoded string. */
9c44cd45
MG
709int
710scm_i_is_narrow_symbol (SCM sym)
711{
712 SCM buf;
713
714 buf = SYMBOL_STRINGBUF (sym);
715 return !STRINGBUF_WIDE (buf);
716}
717
50b1996f
MG
718/* Returns a pointer to the 8-bit Latin-1 encoded character array that
719 contains the name of SYM. */
3ee86942
MV
720const char *
721scm_i_symbol_chars (SCM sym)
722{
9c44cd45
MG
723 SCM buf;
724
725 buf = SYMBOL_STRINGBUF (sym);
726 if (!STRINGBUF_WIDE (buf))
f59cf998 727 return (const char *) STRINGBUF_CHARS (buf);
9c44cd45
MG
728 else
729 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
730 scm_list_1 (sym));
731}
732
50b1996f
MG
733/* Return a pointer to the 32-bit UCS-4-encoded character array of a
734 symbol's name. */
9c44cd45
MG
735const scm_t_wchar *
736scm_i_symbol_wide_chars (SCM sym)
737{
738 SCM buf;
739
740 buf = SYMBOL_STRINGBUF (sym);
741 if (STRINGBUF_WIDE (buf))
f59cf998 742 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
9c44cd45
MG
743 else
744 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
745 scm_list_1 (sym));
3ee86942 746}
1cc91f1b 747
3ee86942
MV
748SCM
749scm_i_symbol_mark (SCM sym)
0f2d19dd 750{
3ee86942
MV
751 scm_gc_mark (SYMBOL_STRINGBUF (sym));
752 return SCM_CELL_OBJECT_3 (sym);
0f2d19dd
JB
753}
754
3ee86942
MV
755void
756scm_i_symbol_free (SCM sym)
757{
758}
0f2d19dd 759
be54b15d 760SCM
3ee86942 761scm_i_symbol_substring (SCM sym, size_t start, size_t end)
be54b15d 762{
3ee86942 763 SCM buf = SYMBOL_STRINGBUF (sym);
9de87eea 764 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 765 SET_STRINGBUF_SHARED (buf);
9de87eea 766 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
fd2b17b9 767 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
3ee86942
MV
768 (scm_t_bits)start, (scm_t_bits) end - start);
769}
be54b15d 770
50b1996f 771/* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
9c44cd45
MG
772scm_t_wchar
773scm_i_symbol_ref (SCM sym, size_t x)
774{
775 if (scm_i_is_narrow_symbol (sym))
776 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
777 else
778 return scm_i_symbol_wide_chars (sym)[x];
779}
780
3ee86942
MV
781/* Debugging
782 */
be54b15d 783
6ce6923b
MG
784SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
785 "Returns an association list containing debugging information\n"
786 "for @var{str}. The association list has the following entries."
787 "@table @code\n"
788 "@item string\n"
789 "The string itself.\n"
790 "@item start\n"
791 "The start index of the string into its stringbuf\n"
792 "@item length\n"
793 "The length of the string\n"
794 "@item shared\n"
795 "If this string is a substring, it returns its parent string.\n"
796 "Otherwise, it returns @code{#f}\n"
88ed5759
MG
797 "@item read-only\n"
798 "@code{#t} if the string is read-only\n"
6ce6923b
MG
799 "@item stringbuf-chars\n"
800 "A new string containing this string's stringbuf's characters\n"
801 "@item stringbuf-length\n"
802 "The number of characters in this stringbuf\n"
803 "@item stringbuf-shared\n"
804 "@code{#t} if this stringbuf is shared\n"
805 "@item stringbuf-inline\n"
806 "@code{#t} if this stringbuf's characters are stored in the\n"
807 "cell itself, or @code{#f} if they were allocated in memory\n"
808 "@item stringbuf-wide\n"
809 "@code{#t} if this stringbuf's characters are stored in a\n"
810 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
811 "buffer\n"
812 "@end table")
3ee86942
MV
813#define FUNC_NAME s_scm_sys_string_dump
814{
6ce6923b
MG
815 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9, e10;
816 SCM buf;
3ee86942 817 SCM_VALIDATE_STRING (1, str);
6ce6923b
MG
818
819 /* String info */
820 e1 = scm_cons (scm_from_locale_symbol ("string"),
821 str);
822 e2 = scm_cons (scm_from_locale_symbol ("start"),
823 scm_from_size_t (STRING_START (str)));
824 e3 = scm_cons (scm_from_locale_symbol ("length"),
825 scm_from_size_t (STRING_LENGTH (str)));
826
3ee86942
MV
827 if (IS_SH_STRING (str))
828 {
6ce6923b
MG
829 e4 = scm_cons (scm_from_locale_symbol ("shared"),
830 SH_STRING_STRING (str));
831 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
3ee86942
MV
832 }
833 else
834 {
6ce6923b
MG
835 e4 = scm_cons (scm_from_locale_symbol ("shared"),
836 SCM_BOOL_F);
837 buf = STRING_STRINGBUF (str);
838 }
9c44cd45 839
88ed5759
MG
840 if (IS_RO_STRING (str))
841 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
842 SCM_BOOL_T);
843 else
844 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
845 SCM_BOOL_F);
846
6ce6923b 847 /* Stringbuf info */
6ce6923b
MG
848 if (!STRINGBUF_WIDE (buf))
849 {
850 size_t len = STRINGBUF_LENGTH (buf);
851 char *cbuf;
852 SCM sbc = scm_i_make_string (len, &cbuf);
853 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
854 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
855 sbc);
3ee86942 856 }
6ce6923b
MG
857 else
858 {
859 size_t len = STRINGBUF_LENGTH (buf);
860 scm_t_wchar *cbuf;
861 SCM sbc = scm_i_make_wide_string (len, &cbuf);
862 u32_cpy ((scm_t_uint32 *) cbuf,
863 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
864 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
865 sbc);
866 }
867 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
868 scm_from_size_t (STRINGBUF_LENGTH (buf)));
869 if (STRINGBUF_SHARED (buf))
870 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
871 SCM_BOOL_T);
872 else
873 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
874 SCM_BOOL_F);
875 if (STRINGBUF_INLINE (buf))
876 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
877 SCM_BOOL_T);
878 else
879 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
880 SCM_BOOL_F);
881 if (STRINGBUF_WIDE (buf))
882 e10 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
883 SCM_BOOL_T);
884 else
885 e10 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
886 SCM_BOOL_F);
887
888 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, SCM_UNDEFINED);
3ee86942
MV
889}
890#undef FUNC_NAME
891
6ce6923b
MG
892SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
893 "Returns an association list containing debugging information\n"
894 "for @var{sym}. The association list has the following entries."
895 "@table @code\n"
896 "@item symbol\n"
897 "The symbol itself\n"
898 "@item hash\n"
899 "Its hash value\n"
88ed5759
MG
900 "@item interned\n"
901 "@code{#t} if it is an interned symbol\n"
6ce6923b
MG
902 "@item stringbuf-chars\n"
903 "A new string containing this symbols's stringbuf's characters\n"
904 "@item stringbuf-length\n"
905 "The number of characters in this stringbuf\n"
906 "@item stringbuf-shared\n"
907 "@code{#t} if this stringbuf is shared\n"
908 "@item stringbuf-inline\n"
909 "@code{#t} if this stringbuf's characters are stored in the\n"
910 "cell itself, or @code{#f} if they were allocated in memory\n"
911 "@item stringbuf-wide\n"
912 "@code{#t} if this stringbuf's characters are stored in a\n"
913 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
914 "buffer\n"
915 "@end table")
3ee86942
MV
916#define FUNC_NAME s_scm_sys_symbol_dump
917{
6ce6923b
MG
918 SCM e1, e2, e3, e4, e5, e6, e7, e8;
919 SCM buf;
3ee86942 920 SCM_VALIDATE_SYMBOL (1, sym);
6ce6923b
MG
921 e1 = scm_cons (scm_from_locale_symbol ("symbol"),
922 sym);
923 e2 = scm_cons (scm_from_locale_symbol ("hash"),
924 scm_from_ulong (scm_i_symbol_hash (sym)));
88ed5759
MG
925 e3 = scm_cons (scm_from_locale_symbol ("interned"),
926 scm_symbol_interned_p (sym));
6ce6923b
MG
927 buf = SYMBOL_STRINGBUF (sym);
928
929 /* Stringbuf info */
6ce6923b
MG
930 if (!STRINGBUF_WIDE (buf))
931 {
932 size_t len = STRINGBUF_LENGTH (buf);
933 char *cbuf;
934 SCM sbc = scm_i_make_string (len, &cbuf);
935 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
936 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
937 sbc);
938 }
9c44cd45 939 else
6ce6923b
MG
940 {
941 size_t len = STRINGBUF_LENGTH (buf);
942 scm_t_wchar *cbuf;
943 SCM sbc = scm_i_make_wide_string (len, &cbuf);
944 u32_cpy ((scm_t_uint32 *) cbuf,
945 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
946 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
947 sbc);
948 }
949 e5 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
950 scm_from_size_t (STRINGBUF_LENGTH (buf)));
951 if (STRINGBUF_SHARED (buf))
952 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
953 SCM_BOOL_T);
954 else
955 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
956 SCM_BOOL_F);
957 if (STRINGBUF_INLINE (buf))
958 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
959 SCM_BOOL_T);
960 else
961 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
962 SCM_BOOL_F);
963 if (STRINGBUF_WIDE (buf))
964 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
965 SCM_BOOL_T);
966 else
967 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
968 SCM_BOOL_F);
969 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, SCM_UNDEFINED);
970
3ee86942
MV
971}
972#undef FUNC_NAME
973
6ce6923b
MG
974#if SCM_STRING_LENGTH_HISTOGRAM
975
9c44cd45 976SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
e1b29f6a 977#define FUNC_NAME s_scm_sys_stringbuf_hist
3ee86942
MV
978{
979 int i;
980 for (i = 0; i < 1000; i++)
981 if (lenhist[i])
982 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
983 fprintf (stderr, ">999: %u\n", lenhist[1000]);
984 return SCM_UNSPECIFIED;
be54b15d
DH
985}
986#undef FUNC_NAME
987
3ee86942
MV
988#endif
989
990\f
991
992SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
993 (SCM obj),
994 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
995#define FUNC_NAME s_scm_string_p
996{
997 return scm_from_bool (IS_STRING (obj));
998}
999#undef FUNC_NAME
1000
1001
1002SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1003
1004SCM_DEFINE (scm_string, "string", 0, 0, 1,
1005 (SCM chrs),
1006 "@deffnx {Scheme Procedure} list->string chrs\n"
1007 "Return a newly allocated string composed of the arguments,\n"
1008 "@var{chrs}.")
1009#define FUNC_NAME s_scm_string
1010{
1011 SCM result;
9c44cd45 1012 SCM rest;
3ee86942 1013 size_t len;
9c44cd45
MG
1014 size_t p = 0;
1015 long i;
3ee86942 1016
9c44cd45
MG
1017 /* Verify that this is a list of chars. */
1018 i = scm_ilength (chrs);
3c7cf7f5
MG
1019 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1020
9c44cd45
MG
1021 len = (size_t) i;
1022 rest = chrs;
3ee86942 1023
9c44cd45 1024 while (len > 0 && scm_is_pair (rest))
3ee86942 1025 {
9c44cd45 1026 SCM elt = SCM_CAR (rest);
3ee86942 1027 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
9c44cd45
MG
1028 rest = SCM_CDR (rest);
1029 len--;
1030 scm_remember_upto_here_1 (elt);
1031 }
1032
1033 /* Construct a string containing this list of chars. */
1034 len = (size_t) i;
1035 rest = chrs;
1036
1037 result = scm_i_make_string (len, NULL);
1038 result = scm_i_string_start_writing (result);
1039 while (len > 0 && scm_is_pair (rest))
1040 {
1041 SCM elt = SCM_CAR (rest);
1042 scm_i_string_set_x (result, p, SCM_CHAR (elt));
1043 p++;
1044 rest = SCM_CDR (rest);
3ee86942 1045 len--;
9c44cd45 1046 scm_remember_upto_here_1 (elt);
3ee86942 1047 }
9c44cd45
MG
1048 scm_i_string_stop_writing ();
1049
3ee86942
MV
1050 if (len > 0)
1051 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
9c44cd45 1052 if (!scm_is_null (rest))
3ee86942
MV
1053 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1054
1055 return result;
1056}
1057#undef FUNC_NAME
be54b15d 1058
3b3b36dd 1059SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
6fa73e72 1060 (SCM k, SCM chr),
0d26a824
MG
1061 "Return a newly allocated string of\n"
1062 "length @var{k}. If @var{chr} is given, then all elements of\n"
1063 "the string are initialized to @var{chr}, otherwise the contents\n"
9401323e 1064 "of the @var{string} are unspecified.")
1bbd0b84 1065#define FUNC_NAME s_scm_make_string
0f2d19dd 1066{
3ee86942
MV
1067 return scm_c_make_string (scm_to_size_t (k), chr);
1068}
1069#undef FUNC_NAME
1070
1071SCM
1072scm_c_make_string (size_t len, SCM chr)
1073#define FUNC_NAME NULL
1074{
9c44cd45
MG
1075 size_t p;
1076 SCM res = scm_i_make_string (len, NULL);
cb0d8be2 1077
e11e83f3
MV
1078 if (!SCM_UNBNDP (chr))
1079 {
3ee86942 1080 SCM_VALIDATE_CHAR (0, chr);
9c44cd45
MG
1081 res = scm_i_string_start_writing (res);
1082 for (p = 0; p < len; p++)
1083 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1084 scm_i_string_stop_writing ();
0f2d19dd 1085 }
e11e83f3
MV
1086
1087 return res;
0f2d19dd 1088}
1bbd0b84 1089#undef FUNC_NAME
0f2d19dd 1090
3b3b36dd 1091SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
0d26a824
MG
1092 (SCM string),
1093 "Return the number of characters in @var{string}.")
1bbd0b84 1094#define FUNC_NAME s_scm_string_length
0f2d19dd 1095{
d1ca2c64 1096 SCM_VALIDATE_STRING (1, string);
3ee86942 1097 return scm_from_size_t (STRING_LENGTH (string));
0f2d19dd 1098}
1bbd0b84 1099#undef FUNC_NAME
0f2d19dd 1100
9c44cd45
MG
1101SCM_DEFINE (scm_string_width, "string-width", 1, 0, 0,
1102 (SCM string),
1103 "Return the bytes used to represent a character in @var{string}."
1104 "This will return 1 or 4.")
1105#define FUNC_NAME s_scm_string_width
1106{
1107 SCM_VALIDATE_STRING (1, string);
1108 if (!scm_i_is_narrow_string (string))
1109 return scm_from_int (4);
1110
1111 return scm_from_int (1);
1112}
1113#undef FUNC_NAME
1114
3ee86942
MV
1115size_t
1116scm_c_string_length (SCM string)
1117{
1118 if (!IS_STRING (string))
1119 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1120 return STRING_LENGTH (string);
1121}
1122
bd9e24b3 1123SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
6fa73e72 1124 (SCM str, SCM k),
9c44cd45
MG
1125 "Return character @var{k} of @var{str} using zero-origin\n"
1126 "indexing. @var{k} must be a valid index of @var{str}.")
1bbd0b84 1127#define FUNC_NAME s_scm_string_ref
0f2d19dd 1128{
3ae3166b 1129 size_t len;
a55c2b68 1130 unsigned long idx;
bd9e24b3 1131
d1ca2c64 1132 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1133
1134 len = scm_i_string_length (str);
1135 if (SCM_LIKELY (len > 0))
1136 idx = scm_to_unsigned_integer (k, 0, len - 1);
1137 else
1138 scm_out_of_range (NULL, k);
1139
9c44cd45
MG
1140 if (scm_i_is_narrow_string (str))
1141 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1142 else
1143 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
0f2d19dd 1144}
1bbd0b84 1145#undef FUNC_NAME
0f2d19dd 1146
3ee86942
MV
1147SCM
1148scm_c_string_ref (SCM str, size_t p)
1149{
1150 if (p >= scm_i_string_length (str))
1151 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1152 if (scm_i_is_narrow_string (str))
1153 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1154 else
1155 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1156
3ee86942 1157}
f0942910 1158
3b3b36dd 1159SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
6fa73e72 1160 (SCM str, SCM k, SCM chr),
9c44cd45
MG
1161 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1162 "an unspecified value. @var{k} must be a valid index of\n"
1163 "@var{str}.")
1bbd0b84 1164#define FUNC_NAME s_scm_string_set_x
0f2d19dd 1165{
3ae3166b 1166 size_t len;
a55c2b68
MV
1167 unsigned long idx;
1168
f0942910 1169 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1170
1171 len = scm_i_string_length (str);
1172 if (SCM_LIKELY (len > 0))
1173 idx = scm_to_unsigned_integer (k, 0, len - 1);
1174 else
1175 scm_out_of_range (NULL, k);
1176
34d19ef6 1177 SCM_VALIDATE_CHAR (3, chr);
9c44cd45
MG
1178 str = scm_i_string_start_writing (str);
1179 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1180 scm_i_string_stop_writing ();
1181
0f2d19dd
JB
1182 return SCM_UNSPECIFIED;
1183}
1bbd0b84 1184#undef FUNC_NAME
0f2d19dd 1185
3ee86942
MV
1186void
1187scm_c_string_set_x (SCM str, size_t p, SCM chr)
1188{
1189 if (p >= scm_i_string_length (str))
1190 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1191 str = scm_i_string_start_writing (str);
1192 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1193 scm_i_string_stop_writing ();
3ee86942 1194}
0f2d19dd 1195
3b3b36dd 1196SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
0d26a824
MG
1197 (SCM str, SCM start, SCM end),
1198 "Return a newly allocated string formed from the characters\n"
1199 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1200 "ending with index @var{end} (exclusive).\n"
1201 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1202 "exact integers satisfying:\n\n"
1203 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1bbd0b84 1204#define FUNC_NAME s_scm_substring
0f2d19dd 1205{
3ee86942 1206 size_t len, from, to;
685c0d71 1207
d1ca2c64 1208 SCM_VALIDATE_STRING (1, str);
3ee86942
MV
1209 len = scm_i_string_length (str);
1210 from = scm_to_unsigned_integer (start, 0, len);
a55c2b68 1211 if (SCM_UNBNDP (end))
3ee86942 1212 to = len;
a55c2b68 1213 else
3ee86942
MV
1214 to = scm_to_unsigned_integer (end, from, len);
1215 return scm_i_substring (str, from, to);
0f2d19dd 1216}
1bbd0b84 1217#undef FUNC_NAME
0f2d19dd 1218
ed35de72
MV
1219SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1220 (SCM str, SCM start, SCM end),
1221 "Return a newly allocated string formed from the characters\n"
1222 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1223 "ending with index @var{end} (exclusive).\n"
1224 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1225 "exact integers satisfying:\n"
1226 "\n"
1227 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1228 "\n"
1229 "The returned string is read-only.\n")
1230#define FUNC_NAME s_scm_substring_read_only
1231{
1232 size_t len, from, to;
1233
1234 SCM_VALIDATE_STRING (1, str);
1235 len = scm_i_string_length (str);
1236 from = scm_to_unsigned_integer (start, 0, len);
1237 if (SCM_UNBNDP (end))
1238 to = len;
1239 else
1240 to = scm_to_unsigned_integer (end, from, len);
1241 return scm_i_substring_read_only (str, from, to);
1242}
1243#undef FUNC_NAME
1244
3ee86942
MV
1245SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1246 (SCM str, SCM start, SCM end),
1247 "Return a newly allocated string formed from the characters\n"
1248 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1249 "ending with index @var{end} (exclusive).\n"
1250 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1251 "exact integers satisfying:\n\n"
1252 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1253#define FUNC_NAME s_scm_substring_copy
1254{
e1b29f6a
MV
1255 /* For the Scheme version, START is mandatory, but for the C
1256 version, it is optional. See scm_string_copy in srfi-13.c for a
1257 rationale.
1258 */
1259
1260 size_t from, to;
3ee86942
MV
1261
1262 SCM_VALIDATE_STRING (1, str);
e1b29f6a
MV
1263 scm_i_get_substring_spec (scm_i_string_length (str),
1264 start, &from, end, &to);
3ee86942
MV
1265 return scm_i_substring_copy (str, from, to);
1266}
1267#undef FUNC_NAME
1268
1269SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1270 (SCM str, SCM start, SCM end),
1271 "Return string that indirectly refers to the characters\n"
1272 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1273 "ending with index @var{end} (exclusive).\n"
1274 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1275 "exact integers satisfying:\n\n"
1276 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1277#define FUNC_NAME s_scm_substring_shared
1278{
1279 size_t len, from, to;
1280
1281 SCM_VALIDATE_STRING (1, str);
1282 len = scm_i_string_length (str);
1283 from = scm_to_unsigned_integer (start, 0, len);
1284 if (SCM_UNBNDP (end))
1285 to = len;
1286 else
1287 to = scm_to_unsigned_integer (end, from, len);
1288 return scm_i_substring_shared (str, from, to);
1289}
1290#undef FUNC_NAME
685c0d71 1291
3b3b36dd 1292SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
6fa73e72 1293 (SCM args),
9c44cd45 1294 "Return a newly allocated string whose characters form the\n"
0d26a824 1295 "concatenation of the given strings, @var{args}.")
1bbd0b84 1296#define FUNC_NAME s_scm_string_append
0f2d19dd
JB
1297{
1298 SCM res;
9c44cd45
MG
1299 size_t len = 0;
1300 int wide = 0;
c829a427 1301 SCM l, s;
bd4911ef 1302 size_t i;
9909c395
MG
1303 union
1304 {
1305 char *narrow;
1306 scm_t_wchar *wide;
1307 } data;
af45e3b0
DH
1308
1309 SCM_VALIDATE_REST_ARGUMENT (args);
9c44cd45 1310 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427
MV
1311 {
1312 s = SCM_CAR (l);
1313 SCM_VALIDATE_STRING (SCM_ARGn, s);
9c44cd45
MG
1314 len += scm_i_string_length (s);
1315 if (!scm_i_is_narrow_string (s))
1316 wide = 1;
c829a427 1317 }
9909c395 1318 data.narrow = NULL;
9c44cd45 1319 if (!wide)
9909c395 1320 res = scm_i_make_string (len, &data.narrow);
9c44cd45 1321 else
9909c395 1322 res = scm_i_make_wide_string (len, &data.wide);
9c44cd45
MG
1323
1324 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427 1325 {
edea856c 1326 size_t len;
c829a427 1327 s = SCM_CAR (l);
3ee86942 1328 SCM_VALIDATE_STRING (SCM_ARGn, s);
edea856c 1329 len = scm_i_string_length (s);
9c44cd45
MG
1330 if (!wide)
1331 {
9909c395
MG
1332 memcpy (data.narrow, scm_i_string_chars (s), len);
1333 data.narrow += len;
9c44cd45
MG
1334 }
1335 else
1336 {
1337 if (scm_i_is_narrow_string (s))
1338 {
1339 for (i = 0; i < scm_i_string_length (s); i++)
9909c395 1340 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
9c44cd45
MG
1341 }
1342 else
9909c395 1343 u32_cpy ((scm_t_uint32 *) data.wide,
9c44cd45 1344 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
9909c395 1345 data.wide += len;
9c44cd45 1346 }
c829a427
MV
1347 scm_remember_upto_here_1 (s);
1348 }
0f2d19dd
JB
1349 return res;
1350}
1bbd0b84 1351#undef FUNC_NAME
0f2d19dd 1352
c829a427
MV
1353int
1354scm_is_string (SCM obj)
1355{
3ee86942 1356 return IS_STRING (obj);
c829a427 1357}
24933780 1358
c829a427
MV
1359SCM
1360scm_from_locale_stringn (const char *str, size_t len)
1361{
1362 SCM res;
1363 char *dst;
4d4528e7 1364
9c44cd45 1365 if (len == (size_t) -1)
c829a427 1366 len = strlen (str);
9c44cd45
MG
1367 if (len == 0)
1368 return scm_nullstr;
1369
3ee86942 1370 res = scm_i_make_string (len, &dst);
c829a427
MV
1371 memcpy (dst, str, len);
1372 return res;
1373}
4d4528e7 1374
c829a427
MV
1375SCM
1376scm_from_locale_string (const char *str)
4d4528e7 1377{
9c44cd45
MG
1378 if (str == NULL)
1379 return scm_nullstr;
1380
c829a427
MV
1381 return scm_from_locale_stringn (str, -1);
1382}
4d4528e7 1383
50b1996f
MG
1384/* Create a new scheme string from the C string STR. The memory of
1385 STR may be used directly as storage for the new string. */
c829a427
MV
1386SCM
1387scm_take_locale_stringn (char *str, size_t len)
1388{
48ddf0d9
KR
1389 SCM buf, res;
1390
9c44cd45 1391 if (len == (size_t) -1)
48ddf0d9 1392 len = strlen (str);
c829a427
MV
1393 else
1394 {
48ddf0d9
KR
1395 /* Ensure STR is null terminated. A realloc for 1 extra byte should
1396 often be satisfied from the alignment padding after the block, with
1397 no actual data movement. */
9c44cd45 1398 str = scm_realloc (str, len + 1);
48ddf0d9 1399 str[len] = '\0';
c829a427 1400 }
c829a427 1401
fd0a5bbc 1402 buf = scm_i_take_stringbufn (str, len);
3ee86942 1403 res = scm_double_cell (STRING_TAG,
9c44cd45 1404 SCM_UNPACK (buf), (scm_t_bits) 0, (scm_t_bits) len);
c829a427
MV
1405 return res;
1406}
1407
48ddf0d9
KR
1408SCM
1409scm_take_locale_string (char *str)
1410{
1411 return scm_take_locale_stringn (str, -1);
1412}
1413
9c44cd45
MG
1414/* Change libunistring escapes (\uXXXX and \UXXXXXXXX) to \xXX \uXXXX
1415 and \UXXXXXX. */
1416static void
1417unistring_escapes_to_guile_escapes (char **bufp, size_t *lenp)
1418{
1419 char *before, *after;
1420 size_t i, j;
1421
1422 before = *bufp;
1423 after = *bufp;
1424 i = 0;
1425 j = 0;
1426 while (i < *lenp)
1427 {
1428 if ((i <= *lenp - 6)
1429 && before[i] == '\\'
1430 && before[i + 1] == 'u'
1431 && before[i + 2] == '0' && before[i + 3] == '0')
1432 {
1433 /* Convert \u00NN to \xNN */
1434 after[j] = '\\';
1435 after[j + 1] = 'x';
30a6b9ca
MG
1436 after[j + 2] = tolower ((int) before[i + 4]);
1437 after[j + 3] = tolower ((int) before[i + 5]);
9c44cd45
MG
1438 i += 6;
1439 j += 4;
1440 }
1441 else if ((i <= *lenp - 10)
1442 && before[i] == '\\'
1443 && before[i + 1] == 'U'
1444 && before[i + 2] == '0' && before[i + 3] == '0')
1445 {
1446 /* Convert \U00NNNNNN to \UNNNNNN */
1447 after[j] = '\\';
1448 after[j + 1] = 'U';
30a6b9ca
MG
1449 after[j + 2] = tolower ((int) before[i + 4]);
1450 after[j + 3] = tolower ((int) before[i + 5]);
1451 after[j + 4] = tolower ((int) before[i + 6]);
1452 after[j + 5] = tolower ((int) before[i + 7]);
1453 after[j + 6] = tolower ((int) before[i + 8]);
1454 after[j + 7] = tolower ((int) before[i + 9]);
9c44cd45
MG
1455 i += 10;
1456 j += 8;
1457 }
1458 else
1459 {
1460 after[j] = before[i];
1461 i++;
1462 j++;
1463 }
1464 }
1465 *lenp = j;
1466 after = scm_realloc (after, j);
1467}
1468
c829a427 1469char *
9c44cd45 1470scm_to_locale_stringn (SCM str, size_t * lenp)
c829a427 1471{
9c44cd45
MG
1472 const char *enc;
1473
1474 /* In the future, enc will hold the port's encoding. */
1475 enc = NULL;
1476
eca29b02
MG
1477 return scm_to_stringn (str, lenp, enc,
1478 SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE);
9c44cd45
MG
1479}
1480
1481/* Low-level scheme to C string conversion function. */
1482char *
1483scm_to_stringn (SCM str, size_t * lenp, const char *encoding,
eca29b02 1484 scm_t_string_failed_conversion_handler handler)
9c44cd45
MG
1485{
1486 static const char iso[11] = "ISO-8859-1";
1487 char *buf;
1488 size_t ilen, len, i;
4d4528e7 1489
3ee86942 1490 if (!scm_is_string (str))
c829a427 1491 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1492 ilen = scm_i_string_length (str);
1493
1494 if (ilen == 0)
1495 {
1496 buf = scm_malloc (1);
1497 buf[0] = '\0';
1498 if (lenp)
1499 *lenp = 0;
1500 return buf;
1501 }
1502
c829a427 1503 if (lenp == NULL)
9c44cd45
MG
1504 for (i = 0; i < ilen; i++)
1505 if (scm_i_string_ref (str, i) == '\0')
1506 scm_misc_error (NULL,
1507 "string contains #\\nul character: ~S",
1508 scm_list_1 (str));
1509
1510 if (scm_i_is_narrow_string (str))
c829a427 1511 {
9c44cd45
MG
1512 if (lenp)
1513 {
1514 buf = scm_malloc (ilen);
1515 memcpy (buf, scm_i_string_chars (str), ilen);
1516 *lenp = ilen;
1517 return buf;
1518 }
1519 else
1520 {
1521 buf = scm_malloc (ilen + 1);
1522 memcpy (buf, scm_i_string_chars (str), ilen);
1523 buf[ilen] = '\0';
1524 return buf;
1525 }
c829a427 1526 }
9c44cd45
MG
1527
1528
1529 buf = NULL;
1530 len = 0;
1531 buf = u32_conv_to_encoding (iso,
eca29b02 1532 (enum iconv_ilseq_handler) handler,
9c44cd45
MG
1533 (scm_t_uint32 *) scm_i_string_wide_chars (str),
1534 ilen, NULL, NULL, &len);
1535 if (buf == NULL)
1536 scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
1537 scm_list_2 (scm_from_locale_string (iso), str));
1538
eca29b02 1539 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
9c44cd45
MG
1540 unistring_escapes_to_guile_escapes (&buf, &len);
1541
1542 if (lenp)
4d4528e7 1543 *lenp = len;
9c44cd45
MG
1544 else
1545 {
1546 buf = scm_realloc (buf, len + 1);
1547 buf[len] = '\0';
1548 }
24933780 1549
c829a427 1550 scm_remember_upto_here_1 (str);
9c44cd45 1551 return buf;
4d4528e7 1552}
af68e5e5 1553
c829a427
MV
1554char *
1555scm_to_locale_string (SCM str)
1556{
1557 return scm_to_locale_stringn (str, NULL);
1558}
af68e5e5 1559
c829a427
MV
1560size_t
1561scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
1562{
1563 size_t len;
9c44cd45 1564 char *result = NULL;
3ee86942 1565 if (!scm_is_string (str))
c829a427 1566 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1567 result = scm_to_locale_stringn (str, &len);
1568
1569 memcpy (buf, result, (len > max_len) ? max_len : len);
1570 free (result);
1571
c829a427
MV
1572 scm_remember_upto_here_1 (str);
1573 return len;
1574}
af68e5e5 1575
3ee86942
MV
1576/* converts C scm_array of strings to SCM scm_list of strings. */
1577/* If argc < 0, a null terminated scm_array is assumed. */
9c44cd45 1578SCM
3ee86942
MV
1579scm_makfromstrs (int argc, char **argv)
1580{
1581 int i = argc;
1582 SCM lst = SCM_EOL;
1583 if (0 > i)
1584 for (i = 0; argv[i]; i++);
1585 while (i--)
1586 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
1587 return lst;
1588}
1589
c829a427
MV
1590/* Return a newly allocated array of char pointers to each of the strings
1591 in args, with a terminating NULL pointer. */
1592
1593char **
1594scm_i_allocate_string_pointers (SCM list)
af68e5e5 1595{
c829a427
MV
1596 char **result;
1597 int len = scm_ilength (list);
1598 int i;
1599
1600 if (len < 0)
1601 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
1602
661ae7ab 1603 scm_dynwind_begin (0);
c829a427
MV
1604
1605 result = (char **) scm_malloc ((len + 1) * sizeof (char *));
1606 result[len] = NULL;
661ae7ab 1607 scm_dynwind_unwind_handler (free, result, 0);
c829a427
MV
1608
1609 /* The list might be have been modified in another thread, so
1610 we check LIST before each access.
1611 */
d2e53ed6 1612 for (i = 0; i < len && scm_is_pair (list); i++)
c829a427
MV
1613 {
1614 result[i] = scm_to_locale_string (SCM_CAR (list));
1615 list = SCM_CDR (list);
1616 }
1617
661ae7ab 1618 scm_dynwind_end ();
c829a427 1619 return result;
af68e5e5 1620}
e53cc817 1621
c829a427
MV
1622void
1623scm_i_free_string_pointers (char **pointers)
1624{
1625 int i;
1626
1627 for (i = 0; pointers[i]; i++)
1628 free (pointers[i]);
1629 free (pointers);
1630}
24933780 1631
6f14f578
MV
1632void
1633scm_i_get_substring_spec (size_t len,
1634 SCM start, size_t *cstart,
1635 SCM end, size_t *cend)
1636{
1637 if (SCM_UNBNDP (start))
1638 *cstart = 0;
1639 else
1640 *cstart = scm_to_unsigned_integer (start, 0, len);
1641
1642 if (SCM_UNBNDP (end))
1643 *cend = len;
1644 else
1645 *cend = scm_to_unsigned_integer (end, *cstart, len);
1646}
1647
3ee86942
MV
1648#if SCM_ENABLE_DEPRECATED
1649
556d75db
MV
1650/* When these definitions are removed, it becomes reasonable to use
1651 read-only strings for string literals. For that, change the reader
1652 to create string literals with scm_c_substring_read_only instead of
1653 with scm_c_substring_copy.
1654*/
1655
3ee86942 1656int
fe78c51a 1657scm_i_deprecated_stringp (SCM str)
3ee86942
MV
1658{
1659 scm_c_issue_deprecation_warning
1660 ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
1661
2616f0e0 1662 return scm_is_string (str);
3ee86942
MV
1663}
1664
1665char *
fe78c51a 1666scm_i_deprecated_string_chars (SCM str)
3ee86942
MV
1667{
1668 char *chars;
1669
1670 scm_c_issue_deprecation_warning
1671 ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
1672
2616f0e0
MV
1673 /* We don't accept shared substrings here since they are not
1674 null-terminated.
1675 */
1676 if (IS_SH_STRING (str))
1677 scm_misc_error (NULL,
1678 "SCM_STRING_CHARS does not work with shared substrings.",
1679 SCM_EOL);
1680
877f06c3 1681 /* We explicitly test for read-only strings to produce a better
556d75db
MV
1682 error message.
1683 */
1684
1685 if (IS_RO_STRING (str))
1686 scm_misc_error (NULL,
1687 "SCM_STRING_CHARS does not work with read-only strings.",
1688 SCM_EOL);
1689
2616f0e0 1690 /* The following is still wrong, of course...
3ee86942 1691 */
9c44cd45 1692 str = scm_i_string_start_writing (str);
3ee86942
MV
1693 chars = scm_i_string_writable_chars (str);
1694 scm_i_string_stop_writing ();
1695 return chars;
1696}
1697
1698size_t
fe78c51a 1699scm_i_deprecated_string_length (SCM str)
3ee86942
MV
1700{
1701 scm_c_issue_deprecation_warning
1702 ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
1703 return scm_c_string_length (str);
1704}
1705
1706#endif
1707
0f2d19dd
JB
1708void
1709scm_init_strings ()
0f2d19dd 1710{
3ee86942 1711 scm_nullstr = scm_i_make_string (0, NULL);
7c33806a 1712
a0599745 1713#include "libguile/strings.x"
0f2d19dd
JB
1714}
1715
89e00824
ML
1716
1717/*
1718 Local Variables:
1719 c-file-style: "gnu"
1720 End:
1721*/