Remove the `scm_tc_free_cell' SMOB type.
[bpt/guile.git] / libguile / strings.c
CommitLineData
2a776823 1/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009 Free Software Foundation, Inc.
0f2d19dd 2 *
73be1d9e 3 * This library is free software; you can redistribute it and/or
53befeb7
NJ
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
0f2d19dd 7 *
53befeb7
NJ
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
0f2d19dd 12 *
73be1d9e
MV
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
53befeb7
NJ
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
73be1d9e 17 */
1bbd0b84 18
1bbd0b84 19
0f2d19dd 20\f
dbb605f5
LC
21#ifdef HAVE_CONFIG_H
22# include <config.h>
23#endif
0f2d19dd 24
faf2c9d7 25#include <string.h>
3ee86942 26#include <stdio.h>
9c44cd45
MG
27#include <ctype.h>
28#include <unistr.h>
eca29b02 29#include <uniconv.h>
faf2c9d7 30
a0599745
MD
31#include "libguile/_scm.h"
32#include "libguile/chars.h"
7c33806a 33#include "libguile/root.h"
a0599745 34#include "libguile/strings.h"
1afff620 35#include "libguile/deprecation.h"
a0599745 36#include "libguile/validate.h"
1afff620 37
0f2d19dd
JB
38\f
39
40/* {Strings}
41 */
42
3ee86942
MV
43
44/* Stringbufs
45 *
46 * XXX - keeping an accurate refcount during GC seems to be quite
47 * tricky, so we just keep score of whether a stringbuf might be
50b1996f 48 * shared, not whether it definitely is.
3ee86942
MV
49 *
50 * The scheme I (mvo) tried to keep an accurate reference count would
51 * recount all strings that point to a stringbuf during the mark-phase
52 * of the GC. This was done since one cannot access the stringbuf of
53 * a string when that string is freed (in order to decrease the
54 * reference count). The memory of the stringbuf might have been
55 * reused already for something completely different.
56 *
57 * This recounted worked for a small number of threads beating on
58 * cow-strings, but it failed randomly with more than 10 threads, say.
59 * I couldn't figure out what went wrong, so I used the conservative
60 * approach implemented below.
61 *
62 * A stringbuf needs to know its length, but only so that it can be
63 * reported when the stringbuf is freed.
64 *
50b1996f
MG
65 * There are 3 storage strategies for stringbufs: inline, outline, and
66 * wide.
67 *
68 * Inline strings are small 8-bit strings stored within the double
69 * cell itself. Outline strings are larger 8-bit strings with GC
70 * allocated storage. Wide strings are 32-bit strings with allocated
71 * storage.
72 *
73 * There was little value in making wide string inlineable, since
74 * there is only room for three inlined 32-bit characters. Thus wide
75 * stringbufs are never inlined.
3ee86942
MV
76 */
77
78#define STRINGBUF_F_SHARED 0x100
79#define STRINGBUF_F_INLINE 0x200
50b1996f
MG
80#define STRINGBUF_F_WIDE 0x400 /* If true, strings have UCS-4
81 encoding. Otherwise, strings
82 are Latin-1. */
3ee86942
MV
83
84#define STRINGBUF_TAG scm_tc7_stringbuf
85#define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
86#define STRINGBUF_INLINE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_INLINE)
9c44cd45 87#define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
3ee86942
MV
88
89#define STRINGBUF_OUTLINE_CHARS(buf) ((char *)SCM_CELL_WORD_1(buf))
90#define STRINGBUF_OUTLINE_LENGTH(buf) (SCM_CELL_WORD_2(buf))
91#define STRINGBUF_INLINE_CHARS(buf) ((char *)SCM_CELL_OBJECT_LOC(buf,1))
92#define STRINGBUF_INLINE_LENGTH(buf) (((size_t)SCM_CELL_WORD_0(buf))>>16)
93
94#define STRINGBUF_CHARS(buf) (STRINGBUF_INLINE (buf) \
95 ? STRINGBUF_INLINE_CHARS (buf) \
96 : STRINGBUF_OUTLINE_CHARS (buf))
50b1996f 97
9c44cd45 98#define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *)SCM_CELL_WORD_1(buf))
3ee86942
MV
99#define STRINGBUF_LENGTH(buf) (STRINGBUF_INLINE (buf) \
100 ? STRINGBUF_INLINE_LENGTH (buf) \
101 : STRINGBUF_OUTLINE_LENGTH (buf))
102
103#define STRINGBUF_MAX_INLINE_LEN (3*sizeof(scm_t_bits))
104
105#define SET_STRINGBUF_SHARED(buf) \
106 (SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED))
107
6ce6923b 108#if SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
109static size_t lenhist[1001];
110#endif
111
50b1996f
MG
112/* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
113 characters. */
3ee86942
MV
114static SCM
115make_stringbuf (size_t len)
0f2d19dd 116{
3ee86942
MV
117 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
118 scm_i_symbol_chars, all stringbufs are null-terminated. Once
119 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
120 has been changed for scm_i_symbol_chars, this null-termination
121 can be dropped.
122 */
123
6ce6923b 124#if SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
125 if (len < 1000)
126 lenhist[len]++;
127 else
128 lenhist[1000]++;
129#endif
0f2d19dd 130
3ee86942
MV
131 if (len <= STRINGBUF_MAX_INLINE_LEN-1)
132 {
133 return scm_double_cell (STRINGBUF_TAG | STRINGBUF_F_INLINE | (len << 16),
134 0, 0, 0);
135 }
136 else
137 {
c5018a2b 138 char *mem = scm_gc_malloc_pointerless (len + 1, "string");
3ee86942
MV
139 mem[len] = '\0';
140 return scm_double_cell (STRINGBUF_TAG, (scm_t_bits) mem,
141 (scm_t_bits) len, (scm_t_bits) 0);
142 }
143}
e53cc817 144
50b1996f
MG
145/* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
146 characters. */
9c44cd45
MG
147static SCM
148make_wide_stringbuf (size_t len)
149{
150 scm_t_wchar *mem;
6ce6923b 151#if SCM_STRING_LENGTH_HISTOGRAM
9c44cd45
MG
152 if (len < 1000)
153 lenhist[len]++;
154 else
155 lenhist[1000]++;
156#endif
157
158 mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
159 mem[len] = 0;
160 return scm_double_cell (STRINGBUF_TAG | STRINGBUF_F_WIDE, (scm_t_bits) mem,
161 (scm_t_bits) len, (scm_t_bits) 0);
162}
163
2b829bbb
KR
164/* Return a new stringbuf whose underlying storage consists of the LEN+1
165 octets pointed to by STR (the last octet is zero). */
7f74cf9a 166SCM
fd0a5bbc
HWN
167scm_i_take_stringbufn (char *str, size_t len)
168{
2b829bbb 169 scm_gc_register_collectable_memory (str, len + 1, "stringbuf");
fd0a5bbc
HWN
170
171 return scm_double_cell (STRINGBUF_TAG, (scm_t_bits) str,
172 (scm_t_bits) len, (scm_t_bits) 0);
173}
174
50b1996f
MG
175/* Convert a stringbuf containing 8-bit Latin-1-encoded characters to
176 one containing 32-bit UCS-4-encoded characters. */
9c44cd45
MG
177static void
178widen_stringbuf (SCM buf)
179{
180 size_t i, len;
181 scm_t_wchar *mem;
182
183 if (STRINGBUF_WIDE (buf))
184 return;
185
186 if (STRINGBUF_INLINE (buf))
187 {
188 len = STRINGBUF_INLINE_LENGTH (buf);
189
190 mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
191 for (i = 0; i < len; i++)
192 mem[i] =
193 (scm_t_wchar) (unsigned char) STRINGBUF_INLINE_CHARS (buf)[i];
194 mem[len] = 0;
195
196 SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) ^ STRINGBUF_F_INLINE);
197 SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE);
198 SCM_SET_CELL_WORD_1 (buf, mem);
199 SCM_SET_CELL_WORD_2 (buf, len);
200 }
201 else
202 {
203 len = STRINGBUF_OUTLINE_LENGTH (buf);
204
205 mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
206 for (i = 0; i < len; i++)
207 mem[i] =
208 (scm_t_wchar) (unsigned char) STRINGBUF_OUTLINE_CHARS (buf)[i];
209 mem[len] = 0;
210
211 scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf), len + 1, "string");
212
213 SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE);
214 SCM_SET_CELL_WORD_1 (buf, mem);
215 SCM_SET_CELL_WORD_2 (buf, len);
216 }
3ee86942 217}
bd9e24b3 218
9de87eea 219scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
bd9e24b3 220
3ee86942
MV
221/* Copy-on-write strings.
222 */
bd9e24b3 223
3ee86942 224#define STRING_TAG scm_tc7_string
bd9e24b3 225
3ee86942
MV
226#define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
227#define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
228#define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
bd9e24b3 229
3ee86942
MV
230#define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
231#define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
232
233#define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
234
ed35de72
MV
235/* Read-only strings.
236 */
237
238#define RO_STRING_TAG (scm_tc7_string + 0x200)
239#define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
240
e1b29f6a
MV
241/* Mutation-sharing substrings
242 */
243
244#define SH_STRING_TAG (scm_tc7_string + 0x100)
245
246#define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
247/* START and LENGTH as for STRINGs. */
248
249#define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
250
50b1996f
MG
251/* Create a scheme string with space for LEN 8-bit Latin-1-encoded
252 characters. CHARSP, if not NULL, will be set to location of the
253 char array. */
3ee86942
MV
254SCM
255scm_i_make_string (size_t len, char **charsp)
256{
257 SCM buf = make_stringbuf (len);
258 SCM res;
259 if (charsp)
260 *charsp = STRINGBUF_CHARS (buf);
261 res = scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
262 (scm_t_bits)0, (scm_t_bits) len);
263 return res;
0f2d19dd
JB
264}
265
50b1996f
MG
266/* Create a scheme string with space for LEN 32-bit UCS-4-encoded
267 characters. CHARSP, if not NULL, will be set to location of the
268 character array. */
9c44cd45 269SCM
50b1996f 270scm_i_make_wide_string (size_t len, scm_t_wchar **charsp)
9c44cd45
MG
271{
272 SCM buf = make_wide_stringbuf (len);
273 SCM res;
274 if (charsp)
275 *charsp = STRINGBUF_WIDE_CHARS (buf);
276 res = scm_double_cell (STRING_TAG, SCM_UNPACK (buf),
277 (scm_t_bits) 0, (scm_t_bits) len);
278 return res;
279}
280
3ee86942
MV
281static void
282validate_substring_args (SCM str, size_t start, size_t end)
283{
284 if (!IS_STRING (str))
285 scm_wrong_type_arg_msg (NULL, 0, str, "string");
286 if (start > STRING_LENGTH (str))
287 scm_out_of_range (NULL, scm_from_size_t (start));
288 if (end > STRING_LENGTH (str) || end < start)
289 scm_out_of_range (NULL, scm_from_size_t (end));
290}
0f2d19dd 291
e1b29f6a
MV
292static inline void
293get_str_buf_start (SCM *str, SCM *buf, size_t *start)
294{
295 *start = STRING_START (*str);
296 if (IS_SH_STRING (*str))
297 {
298 *str = SH_STRING_STRING (*str);
299 *start += STRING_START (*str);
300 }
301 *buf = STRING_STRINGBUF (*str);
302}
303
3ee86942
MV
304SCM
305scm_i_substring (SCM str, size_t start, size_t end)
0f2d19dd 306{
e1b29f6a
MV
307 SCM buf;
308 size_t str_start;
309 get_str_buf_start (&str, &buf, &str_start);
9de87eea 310 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 311 SET_STRINGBUF_SHARED (buf);
9de87eea 312 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 313 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
e1b29f6a
MV
314 (scm_t_bits)str_start + start,
315 (scm_t_bits) end - start);
0f2d19dd
JB
316}
317
ed35de72
MV
318SCM
319scm_i_substring_read_only (SCM str, size_t start, size_t end)
320{
45a9f430
LC
321 SCM buf;
322 size_t str_start;
323 get_str_buf_start (&str, &buf, &str_start);
324 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
325 SET_STRINGBUF_SHARED (buf);
326 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
327 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
328 (scm_t_bits)str_start + start,
329 (scm_t_bits) end - start);
ed35de72
MV
330}
331
3ee86942
MV
332SCM
333scm_i_substring_copy (SCM str, size_t start, size_t end)
334{
335 size_t len = end - start;
edea856c 336 SCM buf, my_buf;
e1b29f6a
MV
337 size_t str_start;
338 get_str_buf_start (&str, &buf, &str_start);
9c44cd45
MG
339 if (scm_i_is_narrow_string (str))
340 {
341 my_buf = make_stringbuf (len);
342 memcpy (STRINGBUF_CHARS (my_buf),
343 STRINGBUF_CHARS (buf) + str_start + start, len);
344 }
345 else
346 {
347 my_buf = make_wide_stringbuf (len);
348 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
349 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
350 + start), len);
351 /* Even though this string is wide, the substring may be narrow.
50b1996f 352 Consider adding code to narrow the string. */
9c44cd45 353 }
3ee86942 354 scm_remember_upto_here_1 (buf);
9c44cd45
MG
355 return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
356 (scm_t_bits) 0, (scm_t_bits) len);
3ee86942 357}
0f2d19dd 358
e1b29f6a
MV
359SCM
360scm_i_substring_shared (SCM str, size_t start, size_t end)
361{
362 if (start == 0 && end == STRING_LENGTH (str))
363 return str;
364 else
365 {
366 size_t len = end - start;
367 if (IS_SH_STRING (str))
368 {
369 start += STRING_START (str);
370 str = SH_STRING_STRING (str);
371 }
372 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
373 (scm_t_bits)start, (scm_t_bits) len);
374 }
375}
376
3ee86942
MV
377SCM
378scm_c_substring (SCM str, size_t start, size_t end)
379{
380 validate_substring_args (str, start, end);
381 return scm_i_substring (str, start, end);
382}
ee149d03 383
ed35de72
MV
384SCM
385scm_c_substring_read_only (SCM str, size_t start, size_t end)
386{
387 validate_substring_args (str, start, end);
388 return scm_i_substring_read_only (str, start, end);
389}
390
0f2d19dd 391SCM
3ee86942 392scm_c_substring_copy (SCM str, size_t start, size_t end)
0f2d19dd 393{
3ee86942
MV
394 validate_substring_args (str, start, end);
395 return scm_i_substring_copy (str, start, end);
396}
397
3ee86942
MV
398SCM
399scm_c_substring_shared (SCM str, size_t start, size_t end)
400{
401 validate_substring_args (str, start, end);
402 return scm_i_substring_shared (str, start, end);
403}
0f2d19dd 404
d6c74168 405\f
3ee86942
MV
406/* Internal accessors
407 */
408
50b1996f
MG
409/* Returns the number of characters in STR. This may be different
410 than the memory size of the string storage. */
3ee86942
MV
411size_t
412scm_i_string_length (SCM str)
0f2d19dd 413{
3ee86942 414 return STRING_LENGTH (str);
0f2d19dd
JB
415}
416
50b1996f
MG
417/* True if the string is 'narrow', meaning it has a 8-bit Latin-1
418 encoding. False if it is 'wide', having a 32-bit UCS-4
419 encoding. */
9c44cd45
MG
420int
421scm_i_is_narrow_string (SCM str)
422{
423 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
424}
425
50b1996f
MG
426/* Returns a pointer to the 8-bit Latin-1 encoded character array of
427 STR. */
3ee86942
MV
428const char *
429scm_i_string_chars (SCM str)
430{
431 SCM buf;
e1b29f6a
MV
432 size_t start;
433 get_str_buf_start (&str, &buf, &start);
9c44cd45
MG
434 if (scm_i_is_narrow_string (str))
435 return STRINGBUF_CHARS (buf) + start;
436 else
437 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
438 scm_list_1 (str));
439 return NULL;
3ee86942 440}
b00418df 441
50b1996f
MG
442/* Returns a pointer to the 32-bit UCS-4 encoded character array of
443 STR. */
9c44cd45
MG
444const scm_t_wchar *
445scm_i_string_wide_chars (SCM str)
446{
447 SCM buf;
448 size_t start;
449
450 get_str_buf_start (&str, &buf, &start);
451 if (!scm_i_is_narrow_string (str))
452 return STRINGBUF_WIDE_CHARS (buf) + start;
453 else
454 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
455 scm_list_1 (str));
456}
457
458/* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
459 a new string buffer, so that it can be modified without modifying
50b1996f
MG
460 other strings. Also, lock the string mutex. Later, one must call
461 scm_i_string_stop_writing to unlock the mutex. */
9c44cd45
MG
462SCM
463scm_i_string_start_writing (SCM orig_str)
b00418df 464{
ed35de72 465 SCM buf, str = orig_str;
e1b29f6a 466 size_t start;
ed35de72 467
e1b29f6a 468 get_str_buf_start (&str, &buf, &start);
ed35de72
MV
469 if (IS_RO_STRING (str))
470 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
471
9de87eea 472 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942
MV
473 if (STRINGBUF_SHARED (buf))
474 {
9c44cd45 475 /* Clone the stringbuf. */
3ee86942
MV
476 size_t len = STRING_LENGTH (str);
477 SCM new_buf;
478
9de87eea 479 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 480
9c44cd45
MG
481 if (scm_i_is_narrow_string (str))
482 {
483 new_buf = make_stringbuf (len);
484 memcpy (STRINGBUF_CHARS (new_buf),
485 STRINGBUF_CHARS (buf) + STRING_START (str), len);
486
487 }
488 else
489 {
490 new_buf = make_wide_stringbuf (len);
491 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
492 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
493 + STRING_START (str)), len);
494 }
3ee86942 495
3ee86942
MV
496 SET_STRING_STRINGBUF (str, new_buf);
497 start -= STRING_START (str);
902578f1
LC
498
499 /* FIXME: The following operations are not atomic, so other threads
500 looking at STR may see an inconsistent state. Nevertheless it can't
501 hurt much since (i) accessing STR while it is being mutated can't
502 yield a crash, and (ii) concurrent accesses to STR should be
503 protected by a mutex at the application level. The latter may not
504 apply when STR != ORIG_STR, though. */
3ee86942 505 SET_STRING_START (str, 0);
902578f1 506 SET_STRING_STRINGBUF (str, new_buf);
3ee86942
MV
507
508 buf = new_buf;
509
9de87eea 510 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 511 }
9c44cd45
MG
512 return orig_str;
513}
3ee86942 514
50b1996f 515/* Return a pointer to the 8-bit Latin-1 chars of a string. */
9c44cd45
MG
516char *
517scm_i_string_writable_chars (SCM str)
518{
519 SCM buf;
520 size_t start;
3ee86942 521
9c44cd45
MG
522 get_str_buf_start (&str, &buf, &start);
523 if (scm_i_is_narrow_string (str))
524 return STRINGBUF_CHARS (buf) + start;
525 else
526 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
527 scm_list_1 (str));
528 return NULL;
b00418df
DH
529}
530
50b1996f 531/* Return a pointer to the UCS-4 codepoints of a string. */
9c44cd45
MG
532static scm_t_wchar *
533scm_i_string_writable_wide_chars (SCM str)
534{
535 SCM buf;
536 size_t start;
537
538 get_str_buf_start (&str, &buf, &start);
539 if (!scm_i_is_narrow_string (str))
540 return STRINGBUF_WIDE_CHARS (buf) + start;
541 else
542 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
543 scm_list_1 (str));
b00418df
DH
544}
545
50b1996f
MG
546/* Unlock the string mutex that was locked when
547 scm_i_string_start_writing was called. */
3ee86942
MV
548void
549scm_i_string_stop_writing (void)
550{
9de87eea 551 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 552}
b00418df 553
50b1996f 554/* Return the Xth character of STR as a UCS-4 codepoint. */
9c44cd45
MG
555scm_t_wchar
556scm_i_string_ref (SCM str, size_t x)
557{
558 if (scm_i_is_narrow_string (str))
559 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
560 else
561 return scm_i_string_wide_chars (str)[x];
562}
563
50b1996f 564/* Set the Pth character of STR to UCS-4 codepoint CHR. */
9c44cd45
MG
565void
566scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
567{
568 if (chr > 0xFF && scm_i_is_narrow_string (str))
569 widen_stringbuf (STRING_STRINGBUF (str));
570
571 if (scm_i_is_narrow_string (str))
572 {
573 char *dst = scm_i_string_writable_chars (str);
574 dst[p] = (char) (unsigned char) chr;
575 }
576 else
577 {
578 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
579 dst[p] = chr;
580 }
581}
582
3ee86942
MV
583/* Symbols.
584
585 Basic symbol creation and accessing is done here, the rest is in
586 symbols.[hc]. This has been done to keep stringbufs and the
587 internals of strings and string-like objects confined to this file.
588*/
589
590#define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
591
592SCM
6869328b
MV
593scm_i_make_symbol (SCM name, scm_t_bits flags,
594 unsigned long hash, SCM props)
3ee86942
MV
595{
596 SCM buf;
597 size_t start = STRING_START (name);
598 size_t length = STRING_LENGTH (name);
599
600 if (IS_SH_STRING (name))
601 {
602 name = SH_STRING_STRING (name);
603 start += STRING_START (name);
604 }
605 buf = SYMBOL_STRINGBUF (name);
606
607 if (start == 0 && length == STRINGBUF_LENGTH (buf))
608 {
609 /* reuse buf. */
9de87eea 610 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 611 SET_STRINGBUF_SHARED (buf);
9de87eea 612 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942
MV
613 }
614 else
615 {
616 /* make new buf. */
9c44cd45
MG
617 if (scm_i_is_narrow_string (name))
618 {
619 SCM new_buf = make_stringbuf (length);
620 memcpy (STRINGBUF_CHARS (new_buf),
621 STRINGBUF_CHARS (buf) + start, length);
622 buf = new_buf;
623 }
624 else
625 {
626 SCM new_buf = make_wide_stringbuf (length);
627 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
628 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
629 length);
630 buf = new_buf;
631 }
3ee86942 632 }
6869328b 633 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
3ee86942
MV
634 (scm_t_bits) hash, SCM_UNPACK (props));
635}
636
fd0a5bbc
HWN
637SCM
638scm_i_c_make_symbol (const char *name, size_t len,
639 scm_t_bits flags, unsigned long hash, SCM props)
640{
641 SCM buf = make_stringbuf (len);
642 memcpy (STRINGBUF_CHARS (buf), name, len);
643
a284cc7e
LC
644 return scm_immutable_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
645 (scm_t_bits) hash, SCM_UNPACK (props));
fd0a5bbc
HWN
646}
647
648/* Return a new symbol that uses the LEN bytes pointed to by NAME as its
649 underlying storage. */
650SCM
651scm_i_c_take_symbol (char *name, size_t len,
652 scm_t_bits flags, unsigned long hash, SCM props)
653{
654 SCM buf = scm_i_take_stringbufn (name, len);
655
656 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
657 (scm_t_bits) hash, SCM_UNPACK (props));
658}
659
50b1996f
MG
660/* Returns the number of characters in SYM. This may be different
661 from the memory size of SYM. */
3ee86942
MV
662size_t
663scm_i_symbol_length (SCM sym)
0f2d19dd 664{
3ee86942 665 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
0f2d19dd
JB
666}
667
071bb6a8
LC
668size_t
669scm_c_symbol_length (SCM sym)
670#define FUNC_NAME "scm_c_symbol_length"
671{
672 SCM_VALIDATE_SYMBOL (1, sym);
673
674 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
675}
676#undef FUNC_NAME
677
50b1996f
MG
678/* True if the name of SYM is stored as a Latin-1 encoded string.
679 False if it is stored as a 32-bit UCS-4-encoded string. */
9c44cd45
MG
680int
681scm_i_is_narrow_symbol (SCM sym)
682{
683 SCM buf;
684
685 buf = SYMBOL_STRINGBUF (sym);
686 return !STRINGBUF_WIDE (buf);
687}
688
50b1996f
MG
689/* Returns a pointer to the 8-bit Latin-1 encoded character array that
690 contains the name of SYM. */
3ee86942
MV
691const char *
692scm_i_symbol_chars (SCM sym)
693{
9c44cd45
MG
694 SCM buf;
695
696 buf = SYMBOL_STRINGBUF (sym);
697 if (!STRINGBUF_WIDE (buf))
698 return STRINGBUF_CHARS (buf);
699 else
700 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
701 scm_list_1 (sym));
702}
703
50b1996f
MG
704/* Return a pointer to the 32-bit UCS-4-encoded character array of a
705 symbol's name. */
9c44cd45
MG
706const scm_t_wchar *
707scm_i_symbol_wide_chars (SCM sym)
708{
709 SCM buf;
710
711 buf = SYMBOL_STRINGBUF (sym);
712 if (STRINGBUF_WIDE (buf))
713 return STRINGBUF_WIDE_CHARS (buf);
714 else
715 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
716 scm_list_1 (sym));
3ee86942 717}
1cc91f1b 718
be54b15d 719SCM
3ee86942 720scm_i_symbol_substring (SCM sym, size_t start, size_t end)
be54b15d 721{
3ee86942 722 SCM buf = SYMBOL_STRINGBUF (sym);
9de87eea 723 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 724 SET_STRINGBUF_SHARED (buf);
9de87eea 725 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
fd2b17b9 726 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
3ee86942
MV
727 (scm_t_bits)start, (scm_t_bits) end - start);
728}
be54b15d 729
50b1996f 730/* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
9c44cd45
MG
731scm_t_wchar
732scm_i_symbol_ref (SCM sym, size_t x)
733{
734 if (scm_i_is_narrow_symbol (sym))
735 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
736 else
737 return scm_i_symbol_wide_chars (sym)[x];
738}
739
3ee86942
MV
740/* Debugging
741 */
be54b15d 742
6ce6923b
MG
743SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
744 "Returns an association list containing debugging information\n"
745 "for @var{str}. The association list has the following entries."
746 "@table @code\n"
747 "@item string\n"
748 "The string itself.\n"
749 "@item start\n"
750 "The start index of the string into its stringbuf\n"
751 "@item length\n"
752 "The length of the string\n"
753 "@item shared\n"
754 "If this string is a substring, it returns its parent string.\n"
755 "Otherwise, it returns @code{#f}\n"
88ed5759
MG
756 "@item read-only\n"
757 "@code{#t} if the string is read-only\n"
6ce6923b
MG
758 "@item stringbuf-chars\n"
759 "A new string containing this string's stringbuf's characters\n"
760 "@item stringbuf-length\n"
761 "The number of characters in this stringbuf\n"
762 "@item stringbuf-shared\n"
763 "@code{#t} if this stringbuf is shared\n"
764 "@item stringbuf-inline\n"
765 "@code{#t} if this stringbuf's characters are stored in the\n"
766 "cell itself, or @code{#f} if they were allocated in memory\n"
767 "@item stringbuf-wide\n"
768 "@code{#t} if this stringbuf's characters are stored in a\n"
769 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
770 "buffer\n"
771 "@end table")
3ee86942
MV
772#define FUNC_NAME s_scm_sys_string_dump
773{
6ce6923b
MG
774 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9, e10;
775 SCM buf;
3ee86942 776 SCM_VALIDATE_STRING (1, str);
6ce6923b
MG
777
778 /* String info */
779 e1 = scm_cons (scm_from_locale_symbol ("string"),
780 str);
781 e2 = scm_cons (scm_from_locale_symbol ("start"),
782 scm_from_size_t (STRING_START (str)));
783 e3 = scm_cons (scm_from_locale_symbol ("length"),
784 scm_from_size_t (STRING_LENGTH (str)));
785
3ee86942
MV
786 if (IS_SH_STRING (str))
787 {
6ce6923b
MG
788 e4 = scm_cons (scm_from_locale_symbol ("shared"),
789 SH_STRING_STRING (str));
790 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
3ee86942
MV
791 }
792 else
793 {
6ce6923b
MG
794 e4 = scm_cons (scm_from_locale_symbol ("shared"),
795 SCM_BOOL_F);
796 buf = STRING_STRINGBUF (str);
3ee86942 797 }
9c44cd45 798
88ed5759
MG
799 if (IS_RO_STRING (str))
800 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
801 SCM_BOOL_T);
802 else
803 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
804 SCM_BOOL_F);
805
6ce6923b 806 /* Stringbuf info */
6ce6923b
MG
807 if (!STRINGBUF_WIDE (buf))
808 {
809 size_t len = STRINGBUF_LENGTH (buf);
810 char *cbuf;
811 SCM sbc = scm_i_make_string (len, &cbuf);
812 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
813 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
814 sbc);
3ee86942 815 }
6ce6923b
MG
816 else
817 {
818 size_t len = STRINGBUF_LENGTH (buf);
819 scm_t_wchar *cbuf;
820 SCM sbc = scm_i_make_wide_string (len, &cbuf);
821 u32_cpy ((scm_t_uint32 *) cbuf,
822 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
823 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
824 sbc);
825 }
826 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
827 scm_from_size_t (STRINGBUF_LENGTH (buf)));
828 if (STRINGBUF_SHARED (buf))
829 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
830 SCM_BOOL_T);
831 else
832 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
833 SCM_BOOL_F);
834 if (STRINGBUF_INLINE (buf))
835 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
836 SCM_BOOL_T);
837 else
838 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
839 SCM_BOOL_F);
840 if (STRINGBUF_WIDE (buf))
841 e10 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
842 SCM_BOOL_T);
843 else
844 e10 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
845 SCM_BOOL_F);
846
847 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, SCM_UNDEFINED);
3ee86942
MV
848}
849#undef FUNC_NAME
850
6ce6923b
MG
851SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
852 "Returns an association list containing debugging information\n"
853 "for @var{sym}. The association list has the following entries."
854 "@table @code\n"
855 "@item symbol\n"
856 "The symbol itself\n"
857 "@item hash\n"
858 "Its hash value\n"
88ed5759
MG
859 "@item interned\n"
860 "@code{#t} if it is an interned symbol\n"
6ce6923b
MG
861 "@item stringbuf-chars\n"
862 "A new string containing this symbols's stringbuf's characters\n"
863 "@item stringbuf-length\n"
864 "The number of characters in this stringbuf\n"
865 "@item stringbuf-shared\n"
866 "@code{#t} if this stringbuf is shared\n"
867 "@item stringbuf-inline\n"
868 "@code{#t} if this stringbuf's characters are stored in the\n"
869 "cell itself, or @code{#f} if they were allocated in memory\n"
870 "@item stringbuf-wide\n"
871 "@code{#t} if this stringbuf's characters are stored in a\n"
872 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
873 "buffer\n"
874 "@end table")
3ee86942
MV
875#define FUNC_NAME s_scm_sys_symbol_dump
876{
6ce6923b
MG
877 SCM e1, e2, e3, e4, e5, e6, e7, e8;
878 SCM buf;
3ee86942 879 SCM_VALIDATE_SYMBOL (1, sym);
6ce6923b
MG
880 e1 = scm_cons (scm_from_locale_symbol ("symbol"),
881 sym);
882 e2 = scm_cons (scm_from_locale_symbol ("hash"),
883 scm_from_ulong (scm_i_symbol_hash (sym)));
88ed5759
MG
884 e3 = scm_cons (scm_from_locale_symbol ("interned"),
885 scm_symbol_interned_p (sym));
6ce6923b
MG
886 buf = SYMBOL_STRINGBUF (sym);
887
888 /* Stringbuf info */
6ce6923b
MG
889 if (!STRINGBUF_WIDE (buf))
890 {
891 size_t len = STRINGBUF_LENGTH (buf);
892 char *cbuf;
893 SCM sbc = scm_i_make_string (len, &cbuf);
894 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
895 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
896 sbc);
897 }
9c44cd45 898 else
6ce6923b
MG
899 {
900 size_t len = STRINGBUF_LENGTH (buf);
901 scm_t_wchar *cbuf;
902 SCM sbc = scm_i_make_wide_string (len, &cbuf);
903 u32_cpy ((scm_t_uint32 *) cbuf,
904 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
905 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
906 sbc);
907 }
908 e5 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
909 scm_from_size_t (STRINGBUF_LENGTH (buf)));
910 if (STRINGBUF_SHARED (buf))
911 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
912 SCM_BOOL_T);
913 else
914 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
915 SCM_BOOL_F);
916 if (STRINGBUF_INLINE (buf))
917 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
918 SCM_BOOL_T);
919 else
920 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
921 SCM_BOOL_F);
922 if (STRINGBUF_WIDE (buf))
923 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
924 SCM_BOOL_T);
925 else
926 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
927 SCM_BOOL_F);
928 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, SCM_UNDEFINED);
929
3ee86942
MV
930}
931#undef FUNC_NAME
932
6ce6923b
MG
933#if SCM_STRING_LENGTH_HISTOGRAM
934
9c44cd45 935SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
e1b29f6a 936#define FUNC_NAME s_scm_sys_stringbuf_hist
3ee86942
MV
937{
938 int i;
939 for (i = 0; i < 1000; i++)
940 if (lenhist[i])
941 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
942 fprintf (stderr, ">999: %u\n", lenhist[1000]);
943 return SCM_UNSPECIFIED;
be54b15d
DH
944}
945#undef FUNC_NAME
946
3ee86942
MV
947#endif
948
949\f
950
951SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
952 (SCM obj),
953 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
954#define FUNC_NAME s_scm_string_p
955{
956 return scm_from_bool (IS_STRING (obj));
957}
958#undef FUNC_NAME
959
960
961SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
962
963SCM_DEFINE (scm_string, "string", 0, 0, 1,
964 (SCM chrs),
965 "@deffnx {Scheme Procedure} list->string chrs\n"
966 "Return a newly allocated string composed of the arguments,\n"
967 "@var{chrs}.")
968#define FUNC_NAME s_scm_string
969{
970 SCM result;
9c44cd45 971 SCM rest;
3ee86942 972 size_t len;
9c44cd45
MG
973 size_t p = 0;
974 long i;
3ee86942 975
9c44cd45
MG
976 /* Verify that this is a list of chars. */
977 i = scm_ilength (chrs);
3c7cf7f5 978 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
3ee86942 979
9c44cd45
MG
980 len = (size_t) i;
981 rest = chrs;
3ee86942 982
9c44cd45 983 while (len > 0 && scm_is_pair (rest))
3ee86942 984 {
9c44cd45 985 SCM elt = SCM_CAR (rest);
3ee86942 986 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
9c44cd45
MG
987 rest = SCM_CDR (rest);
988 len--;
989 scm_remember_upto_here_1 (elt);
990 }
991
992 /* Construct a string containing this list of chars. */
993 len = (size_t) i;
994 rest = chrs;
995
996 result = scm_i_make_string (len, NULL);
997 result = scm_i_string_start_writing (result);
998 while (len > 0 && scm_is_pair (rest))
999 {
1000 SCM elt = SCM_CAR (rest);
1001 scm_i_string_set_x (result, p, SCM_CHAR (elt));
1002 p++;
1003 rest = SCM_CDR (rest);
3ee86942 1004 len--;
9c44cd45 1005 scm_remember_upto_here_1 (elt);
3ee86942 1006 }
9c44cd45
MG
1007 scm_i_string_stop_writing ();
1008
3ee86942
MV
1009 if (len > 0)
1010 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
9c44cd45 1011 if (!scm_is_null (rest))
3ee86942
MV
1012 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1013
1014 return result;
1015}
1016#undef FUNC_NAME
be54b15d 1017
3b3b36dd 1018SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
6fa73e72 1019 (SCM k, SCM chr),
0d26a824
MG
1020 "Return a newly allocated string of\n"
1021 "length @var{k}. If @var{chr} is given, then all elements of\n"
1022 "the string are initialized to @var{chr}, otherwise the contents\n"
9401323e 1023 "of the @var{string} are unspecified.")
1bbd0b84 1024#define FUNC_NAME s_scm_make_string
0f2d19dd 1025{
3ee86942
MV
1026 return scm_c_make_string (scm_to_size_t (k), chr);
1027}
1028#undef FUNC_NAME
1029
1030SCM
1031scm_c_make_string (size_t len, SCM chr)
1032#define FUNC_NAME NULL
1033{
9c44cd45
MG
1034 size_t p;
1035 SCM res = scm_i_make_string (len, NULL);
cb0d8be2 1036
e11e83f3
MV
1037 if (!SCM_UNBNDP (chr))
1038 {
3ee86942 1039 SCM_VALIDATE_CHAR (0, chr);
9c44cd45
MG
1040 res = scm_i_string_start_writing (res);
1041 for (p = 0; p < len; p++)
1042 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1043 scm_i_string_stop_writing ();
0f2d19dd 1044 }
e11e83f3
MV
1045
1046 return res;
0f2d19dd 1047}
1bbd0b84 1048#undef FUNC_NAME
0f2d19dd 1049
3b3b36dd 1050SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
0d26a824
MG
1051 (SCM string),
1052 "Return the number of characters in @var{string}.")
1bbd0b84 1053#define FUNC_NAME s_scm_string_length
0f2d19dd 1054{
d1ca2c64 1055 SCM_VALIDATE_STRING (1, string);
3ee86942 1056 return scm_from_size_t (STRING_LENGTH (string));
0f2d19dd 1057}
1bbd0b84 1058#undef FUNC_NAME
0f2d19dd 1059
9c44cd45
MG
1060SCM_DEFINE (scm_string_width, "string-width", 1, 0, 0,
1061 (SCM string),
1062 "Return the bytes used to represent a character in @var{string}."
1063 "This will return 1 or 4.")
1064#define FUNC_NAME s_scm_string_width
1065{
1066 SCM_VALIDATE_STRING (1, string);
1067 if (!scm_i_is_narrow_string (string))
1068 return scm_from_int (4);
1069
1070 return scm_from_int (1);
1071}
1072#undef FUNC_NAME
1073
3ee86942
MV
1074size_t
1075scm_c_string_length (SCM string)
1076{
1077 if (!IS_STRING (string))
1078 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1079 return STRING_LENGTH (string);
1080}
1081
bd9e24b3 1082SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
6fa73e72 1083 (SCM str, SCM k),
9c44cd45
MG
1084 "Return character @var{k} of @var{str} using zero-origin\n"
1085 "indexing. @var{k} must be a valid index of @var{str}.")
1bbd0b84 1086#define FUNC_NAME s_scm_string_ref
0f2d19dd 1087{
3ae3166b 1088 size_t len;
a55c2b68 1089 unsigned long idx;
bd9e24b3 1090
d1ca2c64 1091 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1092
1093 len = scm_i_string_length (str);
1094 if (SCM_LIKELY (len > 0))
1095 idx = scm_to_unsigned_integer (k, 0, len - 1);
1096 else
1097 scm_out_of_range (NULL, k);
1098
9c44cd45
MG
1099 if (scm_i_is_narrow_string (str))
1100 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1101 else
1102 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
0f2d19dd 1103}
1bbd0b84 1104#undef FUNC_NAME
0f2d19dd 1105
3ee86942
MV
1106SCM
1107scm_c_string_ref (SCM str, size_t p)
1108{
1109 if (p >= scm_i_string_length (str))
1110 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1111 if (scm_i_is_narrow_string (str))
1112 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1113 else
1114 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1115
3ee86942 1116}
f0942910 1117
3b3b36dd 1118SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
6fa73e72 1119 (SCM str, SCM k, SCM chr),
9c44cd45
MG
1120 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1121 "an unspecified value. @var{k} must be a valid index of\n"
1122 "@var{str}.")
1bbd0b84 1123#define FUNC_NAME s_scm_string_set_x
0f2d19dd 1124{
3ae3166b 1125 size_t len;
a55c2b68
MV
1126 unsigned long idx;
1127
f0942910 1128 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1129
1130 len = scm_i_string_length (str);
1131 if (SCM_LIKELY (len > 0))
1132 idx = scm_to_unsigned_integer (k, 0, len - 1);
1133 else
1134 scm_out_of_range (NULL, k);
1135
34d19ef6 1136 SCM_VALIDATE_CHAR (3, chr);
9c44cd45
MG
1137 str = scm_i_string_start_writing (str);
1138 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1139 scm_i_string_stop_writing ();
1140
0f2d19dd
JB
1141 return SCM_UNSPECIFIED;
1142}
1bbd0b84 1143#undef FUNC_NAME
0f2d19dd 1144
3ee86942
MV
1145void
1146scm_c_string_set_x (SCM str, size_t p, SCM chr)
1147{
1148 if (p >= scm_i_string_length (str))
1149 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1150 str = scm_i_string_start_writing (str);
1151 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1152 scm_i_string_stop_writing ();
3ee86942 1153}
0f2d19dd 1154
3b3b36dd 1155SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
0d26a824
MG
1156 (SCM str, SCM start, SCM end),
1157 "Return a newly allocated string formed from the characters\n"
1158 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1159 "ending with index @var{end} (exclusive).\n"
1160 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1161 "exact integers satisfying:\n\n"
1162 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1bbd0b84 1163#define FUNC_NAME s_scm_substring
0f2d19dd 1164{
3ee86942 1165 size_t len, from, to;
685c0d71 1166
d1ca2c64 1167 SCM_VALIDATE_STRING (1, str);
3ee86942
MV
1168 len = scm_i_string_length (str);
1169 from = scm_to_unsigned_integer (start, 0, len);
a55c2b68 1170 if (SCM_UNBNDP (end))
3ee86942 1171 to = len;
a55c2b68 1172 else
3ee86942
MV
1173 to = scm_to_unsigned_integer (end, from, len);
1174 return scm_i_substring (str, from, to);
0f2d19dd 1175}
1bbd0b84 1176#undef FUNC_NAME
0f2d19dd 1177
ed35de72
MV
1178SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1179 (SCM str, SCM start, SCM end),
1180 "Return a newly allocated string formed from the characters\n"
1181 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1182 "ending with index @var{end} (exclusive).\n"
1183 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1184 "exact integers satisfying:\n"
1185 "\n"
1186 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1187 "\n"
1188 "The returned string is read-only.\n")
1189#define FUNC_NAME s_scm_substring_read_only
1190{
1191 size_t len, from, to;
1192
1193 SCM_VALIDATE_STRING (1, str);
1194 len = scm_i_string_length (str);
1195 from = scm_to_unsigned_integer (start, 0, len);
1196 if (SCM_UNBNDP (end))
1197 to = len;
1198 else
1199 to = scm_to_unsigned_integer (end, from, len);
1200 return scm_i_substring_read_only (str, from, to);
1201}
1202#undef FUNC_NAME
1203
3ee86942
MV
1204SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1205 (SCM str, SCM start, SCM end),
1206 "Return a newly allocated string formed from the characters\n"
1207 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1208 "ending with index @var{end} (exclusive).\n"
1209 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1210 "exact integers satisfying:\n\n"
1211 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1212#define FUNC_NAME s_scm_substring_copy
1213{
e1b29f6a
MV
1214 /* For the Scheme version, START is mandatory, but for the C
1215 version, it is optional. See scm_string_copy in srfi-13.c for a
1216 rationale.
1217 */
1218
1219 size_t from, to;
3ee86942
MV
1220
1221 SCM_VALIDATE_STRING (1, str);
e1b29f6a
MV
1222 scm_i_get_substring_spec (scm_i_string_length (str),
1223 start, &from, end, &to);
3ee86942
MV
1224 return scm_i_substring_copy (str, from, to);
1225}
1226#undef FUNC_NAME
1227
1228SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1229 (SCM str, SCM start, SCM end),
1230 "Return string that indirectly refers to the characters\n"
1231 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1232 "ending with index @var{end} (exclusive).\n"
1233 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1234 "exact integers satisfying:\n\n"
1235 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1236#define FUNC_NAME s_scm_substring_shared
1237{
1238 size_t len, from, to;
1239
1240 SCM_VALIDATE_STRING (1, str);
1241 len = scm_i_string_length (str);
1242 from = scm_to_unsigned_integer (start, 0, len);
1243 if (SCM_UNBNDP (end))
1244 to = len;
1245 else
1246 to = scm_to_unsigned_integer (end, from, len);
1247 return scm_i_substring_shared (str, from, to);
1248}
1249#undef FUNC_NAME
685c0d71 1250
3b3b36dd 1251SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
6fa73e72 1252 (SCM args),
9c44cd45 1253 "Return a newly allocated string whose characters form the\n"
0d26a824 1254 "concatenation of the given strings, @var{args}.")
1bbd0b84 1255#define FUNC_NAME s_scm_string_append
0f2d19dd
JB
1256{
1257 SCM res;
9c44cd45
MG
1258 size_t len = 0;
1259 int wide = 0;
c829a427 1260 SCM l, s;
bd4911ef 1261 size_t i;
9909c395
MG
1262 union
1263 {
1264 char *narrow;
1265 scm_t_wchar *wide;
1266 } data;
af45e3b0
DH
1267
1268 SCM_VALIDATE_REST_ARGUMENT (args);
9c44cd45 1269 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427
MV
1270 {
1271 s = SCM_CAR (l);
1272 SCM_VALIDATE_STRING (SCM_ARGn, s);
9c44cd45
MG
1273 len += scm_i_string_length (s);
1274 if (!scm_i_is_narrow_string (s))
1275 wide = 1;
c829a427 1276 }
9909c395 1277 data.narrow = NULL;
9c44cd45 1278 if (!wide)
9909c395 1279 res = scm_i_make_string (len, &data.narrow);
9c44cd45 1280 else
9909c395 1281 res = scm_i_make_wide_string (len, &data.wide);
9c44cd45
MG
1282
1283 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427 1284 {
edea856c 1285 size_t len;
c829a427 1286 s = SCM_CAR (l);
3ee86942 1287 SCM_VALIDATE_STRING (SCM_ARGn, s);
edea856c 1288 len = scm_i_string_length (s);
9c44cd45
MG
1289 if (!wide)
1290 {
9909c395
MG
1291 memcpy (data.narrow, scm_i_string_chars (s), len);
1292 data.narrow += len;
9c44cd45
MG
1293 }
1294 else
1295 {
1296 if (scm_i_is_narrow_string (s))
1297 {
1298 for (i = 0; i < scm_i_string_length (s); i++)
9909c395 1299 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
9c44cd45
MG
1300 }
1301 else
9909c395 1302 u32_cpy ((scm_t_uint32 *) data.wide,
9c44cd45 1303 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
9909c395 1304 data.wide += len;
9c44cd45 1305 }
c829a427
MV
1306 scm_remember_upto_here_1 (s);
1307 }
0f2d19dd
JB
1308 return res;
1309}
1bbd0b84 1310#undef FUNC_NAME
0f2d19dd 1311
c829a427
MV
1312int
1313scm_is_string (SCM obj)
1314{
3ee86942 1315 return IS_STRING (obj);
c829a427 1316}
24933780 1317
c829a427
MV
1318SCM
1319scm_from_locale_stringn (const char *str, size_t len)
1320{
1321 SCM res;
1322 char *dst;
4d4528e7 1323
9c44cd45 1324 if (len == (size_t) -1)
c829a427 1325 len = strlen (str);
9c44cd45
MG
1326 if (len == 0)
1327 return scm_nullstr;
1328
3ee86942 1329 res = scm_i_make_string (len, &dst);
c829a427
MV
1330 memcpy (dst, str, len);
1331 return res;
1332}
4d4528e7 1333
c829a427
MV
1334SCM
1335scm_from_locale_string (const char *str)
4d4528e7 1336{
9c44cd45
MG
1337 if (str == NULL)
1338 return scm_nullstr;
1339
c829a427
MV
1340 return scm_from_locale_stringn (str, -1);
1341}
4d4528e7 1342
50b1996f
MG
1343/* Create a new scheme string from the C string STR. The memory of
1344 STR may be used directly as storage for the new string. */
c829a427
MV
1345SCM
1346scm_take_locale_stringn (char *str, size_t len)
1347{
48ddf0d9
KR
1348 SCM buf, res;
1349
9c44cd45 1350 if (len == (size_t) -1)
48ddf0d9 1351 len = strlen (str);
c829a427
MV
1352 else
1353 {
48ddf0d9
KR
1354 /* Ensure STR is null terminated. A realloc for 1 extra byte should
1355 often be satisfied from the alignment padding after the block, with
1356 no actual data movement. */
9c44cd45 1357 str = scm_realloc (str, len + 1);
48ddf0d9 1358 str[len] = '\0';
c829a427 1359 }
c829a427 1360
fd0a5bbc 1361 buf = scm_i_take_stringbufn (str, len);
3ee86942 1362 res = scm_double_cell (STRING_TAG,
9c44cd45 1363 SCM_UNPACK (buf), (scm_t_bits) 0, (scm_t_bits) len);
c829a427
MV
1364 return res;
1365}
1366
48ddf0d9
KR
1367SCM
1368scm_take_locale_string (char *str)
1369{
1370 return scm_take_locale_stringn (str, -1);
1371}
1372
9c44cd45
MG
1373/* Change libunistring escapes (\uXXXX and \UXXXXXXXX) to \xXX \uXXXX
1374 and \UXXXXXX. */
1375static void
1376unistring_escapes_to_guile_escapes (char **bufp, size_t *lenp)
1377{
1378 char *before, *after;
1379 size_t i, j;
1380
1381 before = *bufp;
1382 after = *bufp;
1383 i = 0;
1384 j = 0;
1385 while (i < *lenp)
1386 {
1387 if ((i <= *lenp - 6)
1388 && before[i] == '\\'
1389 && before[i + 1] == 'u'
1390 && before[i + 2] == '0' && before[i + 3] == '0')
1391 {
1392 /* Convert \u00NN to \xNN */
1393 after[j] = '\\';
1394 after[j + 1] = 'x';
30a6b9ca
MG
1395 after[j + 2] = tolower ((int) before[i + 4]);
1396 after[j + 3] = tolower ((int) before[i + 5]);
9c44cd45
MG
1397 i += 6;
1398 j += 4;
1399 }
1400 else if ((i <= *lenp - 10)
1401 && before[i] == '\\'
1402 && before[i + 1] == 'U'
1403 && before[i + 2] == '0' && before[i + 3] == '0')
1404 {
1405 /* Convert \U00NNNNNN to \UNNNNNN */
1406 after[j] = '\\';
1407 after[j + 1] = 'U';
30a6b9ca
MG
1408 after[j + 2] = tolower ((int) before[i + 4]);
1409 after[j + 3] = tolower ((int) before[i + 5]);
1410 after[j + 4] = tolower ((int) before[i + 6]);
1411 after[j + 5] = tolower ((int) before[i + 7]);
1412 after[j + 6] = tolower ((int) before[i + 8]);
1413 after[j + 7] = tolower ((int) before[i + 9]);
9c44cd45
MG
1414 i += 10;
1415 j += 8;
1416 }
1417 else
1418 {
1419 after[j] = before[i];
1420 i++;
1421 j++;
1422 }
1423 }
1424 *lenp = j;
1425 after = scm_realloc (after, j);
1426}
1427
c829a427 1428char *
9c44cd45 1429scm_to_locale_stringn (SCM str, size_t * lenp)
c829a427 1430{
9c44cd45
MG
1431 const char *enc;
1432
1433 /* In the future, enc will hold the port's encoding. */
1434 enc = NULL;
1435
eca29b02
MG
1436 return scm_to_stringn (str, lenp, enc,
1437 SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE);
9c44cd45
MG
1438}
1439
1440/* Low-level scheme to C string conversion function. */
1441char *
1442scm_to_stringn (SCM str, size_t * lenp, const char *encoding,
eca29b02 1443 scm_t_string_failed_conversion_handler handler)
9c44cd45
MG
1444{
1445 static const char iso[11] = "ISO-8859-1";
1446 char *buf;
1447 size_t ilen, len, i;
4d4528e7 1448
3ee86942 1449 if (!scm_is_string (str))
c829a427 1450 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1451 ilen = scm_i_string_length (str);
1452
1453 if (ilen == 0)
1454 {
1455 buf = scm_malloc (1);
1456 buf[0] = '\0';
1457 if (lenp)
1458 *lenp = 0;
1459 return buf;
1460 }
1461
c829a427 1462 if (lenp == NULL)
9c44cd45
MG
1463 for (i = 0; i < ilen; i++)
1464 if (scm_i_string_ref (str, i) == '\0')
1465 scm_misc_error (NULL,
1466 "string contains #\\nul character: ~S",
1467 scm_list_1 (str));
1468
1469 if (scm_i_is_narrow_string (str))
c829a427 1470 {
9c44cd45
MG
1471 if (lenp)
1472 {
1473 buf = scm_malloc (ilen);
1474 memcpy (buf, scm_i_string_chars (str), ilen);
1475 *lenp = ilen;
1476 return buf;
1477 }
1478 else
1479 {
1480 buf = scm_malloc (ilen + 1);
1481 memcpy (buf, scm_i_string_chars (str), ilen);
1482 buf[ilen] = '\0';
1483 return buf;
1484 }
c829a427 1485 }
9c44cd45
MG
1486
1487
1488 buf = NULL;
1489 len = 0;
1490 buf = u32_conv_to_encoding (iso,
eca29b02 1491 (enum iconv_ilseq_handler) handler,
9c44cd45
MG
1492 (scm_t_uint32 *) scm_i_string_wide_chars (str),
1493 ilen, NULL, NULL, &len);
1494 if (buf == NULL)
1495 scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
1496 scm_list_2 (scm_from_locale_string (iso), str));
1497
eca29b02 1498 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
9c44cd45
MG
1499 unistring_escapes_to_guile_escapes (&buf, &len);
1500
1501 if (lenp)
4d4528e7 1502 *lenp = len;
9c44cd45
MG
1503 else
1504 {
1505 buf = scm_realloc (buf, len + 1);
1506 buf[len] = '\0';
1507 }
24933780 1508
c829a427 1509 scm_remember_upto_here_1 (str);
9c44cd45 1510 return buf;
4d4528e7 1511}
af68e5e5 1512
c829a427
MV
1513char *
1514scm_to_locale_string (SCM str)
1515{
1516 return scm_to_locale_stringn (str, NULL);
1517}
af68e5e5 1518
c829a427
MV
1519size_t
1520scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
1521{
1522 size_t len;
9c44cd45 1523 char *result = NULL;
3ee86942 1524 if (!scm_is_string (str))
c829a427 1525 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1526 result = scm_to_locale_stringn (str, &len);
1527
1528 memcpy (buf, result, (len > max_len) ? max_len : len);
1529 free (result);
1530
c829a427
MV
1531 scm_remember_upto_here_1 (str);
1532 return len;
1533}
af68e5e5 1534
3ee86942
MV
1535/* converts C scm_array of strings to SCM scm_list of strings. */
1536/* If argc < 0, a null terminated scm_array is assumed. */
9c44cd45 1537SCM
3ee86942
MV
1538scm_makfromstrs (int argc, char **argv)
1539{
1540 int i = argc;
1541 SCM lst = SCM_EOL;
1542 if (0 > i)
1543 for (i = 0; argv[i]; i++);
1544 while (i--)
1545 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
1546 return lst;
1547}
1548
c829a427
MV
1549/* Return a newly allocated array of char pointers to each of the strings
1550 in args, with a terminating NULL pointer. */
1551
1552char **
1553scm_i_allocate_string_pointers (SCM list)
2a776823 1554#define FUNC_NAME "scm_i_allocate_string_pointers"
af68e5e5 1555{
c829a427
MV
1556 char **result;
1557 int len = scm_ilength (list);
1558 int i;
1559
1560 if (len < 0)
1561 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
1562
2a776823
LC
1563 result = scm_gc_malloc ((len + 1) * sizeof (char *),
1564 "string pointers");
c829a427 1565 result[len] = NULL;
c829a427
MV
1566
1567 /* The list might be have been modified in another thread, so
1568 we check LIST before each access.
1569 */
d2e53ed6 1570 for (i = 0; i < len && scm_is_pair (list); i++)
c829a427 1571 {
2a776823
LC
1572 SCM str;
1573 size_t len;
1574
1575 str = SCM_CAR (list);
1576 len = scm_c_string_length (str);
1577
1578 result[i] = scm_gc_malloc_pointerless (len + 1, "string pointers");
1579 memcpy (result[i], scm_i_string_chars (str), len);
1580 result[i][len] = '\0';
1581
c829a427
MV
1582 list = SCM_CDR (list);
1583 }
1584
c829a427 1585 return result;
af68e5e5 1586}
2a776823 1587#undef FUNC_NAME
24933780 1588
6f14f578
MV
1589void
1590scm_i_get_substring_spec (size_t len,
1591 SCM start, size_t *cstart,
1592 SCM end, size_t *cend)
1593{
1594 if (SCM_UNBNDP (start))
1595 *cstart = 0;
1596 else
1597 *cstart = scm_to_unsigned_integer (start, 0, len);
1598
1599 if (SCM_UNBNDP (end))
1600 *cend = len;
1601 else
1602 *cend = scm_to_unsigned_integer (end, *cstart, len);
1603}
1604
3ee86942
MV
1605#if SCM_ENABLE_DEPRECATED
1606
556d75db
MV
1607/* When these definitions are removed, it becomes reasonable to use
1608 read-only strings for string literals. For that, change the reader
1609 to create string literals with scm_c_substring_read_only instead of
1610 with scm_c_substring_copy.
1611*/
1612
3ee86942 1613int
fe78c51a 1614scm_i_deprecated_stringp (SCM str)
3ee86942
MV
1615{
1616 scm_c_issue_deprecation_warning
1617 ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
1618
2616f0e0 1619 return scm_is_string (str);
3ee86942
MV
1620}
1621
1622char *
fe78c51a 1623scm_i_deprecated_string_chars (SCM str)
3ee86942
MV
1624{
1625 char *chars;
1626
1627 scm_c_issue_deprecation_warning
1628 ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
1629
2616f0e0
MV
1630 /* We don't accept shared substrings here since they are not
1631 null-terminated.
1632 */
1633 if (IS_SH_STRING (str))
1634 scm_misc_error (NULL,
1635 "SCM_STRING_CHARS does not work with shared substrings.",
1636 SCM_EOL);
1637
877f06c3 1638 /* We explicitly test for read-only strings to produce a better
556d75db
MV
1639 error message.
1640 */
1641
1642 if (IS_RO_STRING (str))
1643 scm_misc_error (NULL,
1644 "SCM_STRING_CHARS does not work with read-only strings.",
1645 SCM_EOL);
1646
2616f0e0 1647 /* The following is still wrong, of course...
3ee86942 1648 */
9c44cd45 1649 str = scm_i_string_start_writing (str);
3ee86942
MV
1650 chars = scm_i_string_writable_chars (str);
1651 scm_i_string_stop_writing ();
1652 return chars;
1653}
1654
1655size_t
fe78c51a 1656scm_i_deprecated_string_length (SCM str)
3ee86942
MV
1657{
1658 scm_c_issue_deprecation_warning
1659 ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
1660 return scm_c_string_length (str);
1661}
1662
1663#endif
1664
0f2d19dd
JB
1665void
1666scm_init_strings ()
0f2d19dd 1667{
3ee86942 1668 scm_nullstr = scm_i_make_string (0, NULL);
7c33806a 1669
a0599745 1670#include "libguile/strings.x"
0f2d19dd
JB
1671}
1672
89e00824
ML
1673
1674/*
1675 Local Variables:
1676 c-file-style: "gnu"
1677 End:
1678*/