Some signed/unsigned comparison and conversions
[bpt/guile.git] / libguile / strings.c
CommitLineData
50b1996f 1/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009 Free Software Foundation, Inc.
0f2d19dd 2 *
73be1d9e 3 * This library is free software; you can redistribute it and/or
53befeb7
NJ
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
0f2d19dd 7 *
53befeb7
NJ
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
0f2d19dd 12 *
73be1d9e
MV
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
53befeb7
NJ
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
73be1d9e 17 */
1bbd0b84 18
1bbd0b84 19
0f2d19dd 20\f
dbb605f5
LC
21#ifdef HAVE_CONFIG_H
22# include <config.h>
23#endif
0f2d19dd 24
faf2c9d7 25#include <string.h>
3ee86942 26#include <stdio.h>
9c44cd45
MG
27#include <ctype.h>
28#include <unistr.h>
faf2c9d7 29
a0599745
MD
30#include "libguile/_scm.h"
31#include "libguile/chars.h"
7c33806a 32#include "libguile/root.h"
a0599745 33#include "libguile/strings.h"
1afff620 34#include "libguile/deprecation.h"
a0599745 35#include "libguile/validate.h"
c829a427 36#include "libguile/dynwind.h"
1afff620 37
0f2d19dd
JB
38\f
39
40/* {Strings}
41 */
42
3ee86942
MV
43
44/* Stringbufs
45 *
46 * XXX - keeping an accurate refcount during GC seems to be quite
47 * tricky, so we just keep score of whether a stringbuf might be
50b1996f 48 * shared, not whether it definitely is.
3ee86942
MV
49 *
50 * The scheme I (mvo) tried to keep an accurate reference count would
51 * recount all strings that point to a stringbuf during the mark-phase
52 * of the GC. This was done since one cannot access the stringbuf of
53 * a string when that string is freed (in order to decrease the
54 * reference count). The memory of the stringbuf might have been
55 * reused already for something completely different.
56 *
57 * This recounted worked for a small number of threads beating on
58 * cow-strings, but it failed randomly with more than 10 threads, say.
59 * I couldn't figure out what went wrong, so I used the conservative
60 * approach implemented below.
61 *
62 * A stringbuf needs to know its length, but only so that it can be
63 * reported when the stringbuf is freed.
64 *
50b1996f
MG
65 * There are 3 storage strategies for stringbufs: inline, outline, and
66 * wide.
67 *
68 * Inline strings are small 8-bit strings stored within the double
69 * cell itself. Outline strings are larger 8-bit strings with GC
70 * allocated storage. Wide strings are 32-bit strings with allocated
71 * storage.
72 *
73 * There was little value in making wide string inlineable, since
74 * there is only room for three inlined 32-bit characters. Thus wide
75 * stringbufs are never inlined.
3ee86942
MV
76 */
77
78#define STRINGBUF_F_SHARED 0x100
79#define STRINGBUF_F_INLINE 0x200
50b1996f
MG
80#define STRINGBUF_F_WIDE 0x400 /* If true, strings have UCS-4
81 encoding. Otherwise, strings
82 are Latin-1. */
3ee86942
MV
83
84#define STRINGBUF_TAG scm_tc7_stringbuf
85#define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
86#define STRINGBUF_INLINE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_INLINE)
9c44cd45 87#define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
3ee86942
MV
88
89#define STRINGBUF_OUTLINE_CHARS(buf) ((char *)SCM_CELL_WORD_1(buf))
90#define STRINGBUF_OUTLINE_LENGTH(buf) (SCM_CELL_WORD_2(buf))
91#define STRINGBUF_INLINE_CHARS(buf) ((char *)SCM_CELL_OBJECT_LOC(buf,1))
92#define STRINGBUF_INLINE_LENGTH(buf) (((size_t)SCM_CELL_WORD_0(buf))>>16)
93
94#define STRINGBUF_CHARS(buf) (STRINGBUF_INLINE (buf) \
95 ? STRINGBUF_INLINE_CHARS (buf) \
96 : STRINGBUF_OUTLINE_CHARS (buf))
50b1996f 97
9c44cd45 98#define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *)SCM_CELL_WORD_1(buf))
3ee86942
MV
99#define STRINGBUF_LENGTH(buf) (STRINGBUF_INLINE (buf) \
100 ? STRINGBUF_INLINE_LENGTH (buf) \
101 : STRINGBUF_OUTLINE_LENGTH (buf))
102
103#define STRINGBUF_MAX_INLINE_LEN (3*sizeof(scm_t_bits))
104
105#define SET_STRINGBUF_SHARED(buf) \
106 (SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED))
107
6ce6923b 108#if SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
109static size_t lenhist[1001];
110#endif
111
50b1996f
MG
112/* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
113 characters. */
3ee86942
MV
114static SCM
115make_stringbuf (size_t len)
0f2d19dd 116{
3ee86942
MV
117 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
118 scm_i_symbol_chars, all stringbufs are null-terminated. Once
119 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
120 has been changed for scm_i_symbol_chars, this null-termination
121 can be dropped.
122 */
123
6ce6923b 124#if SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
125 if (len < 1000)
126 lenhist[len]++;
127 else
128 lenhist[1000]++;
129#endif
0f2d19dd 130
3ee86942
MV
131 if (len <= STRINGBUF_MAX_INLINE_LEN-1)
132 {
133 return scm_double_cell (STRINGBUF_TAG | STRINGBUF_F_INLINE | (len << 16),
134 0, 0, 0);
135 }
136 else
137 {
138 char *mem = scm_gc_malloc (len+1, "string");
139 mem[len] = '\0';
140 return scm_double_cell (STRINGBUF_TAG, (scm_t_bits) mem,
141 (scm_t_bits) len, (scm_t_bits) 0);
142 }
143}
e53cc817 144
50b1996f
MG
145/* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
146 characters. */
9c44cd45
MG
147static SCM
148make_wide_stringbuf (size_t len)
149{
150 scm_t_wchar *mem;
6ce6923b 151#if SCM_STRING_LENGTH_HISTOGRAM
9c44cd45
MG
152 if (len < 1000)
153 lenhist[len]++;
154 else
155 lenhist[1000]++;
156#endif
157
158 mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
159 mem[len] = 0;
160 return scm_double_cell (STRINGBUF_TAG | STRINGBUF_F_WIDE, (scm_t_bits) mem,
161 (scm_t_bits) len, (scm_t_bits) 0);
162}
163
2b829bbb
KR
164/* Return a new stringbuf whose underlying storage consists of the LEN+1
165 octets pointed to by STR (the last octet is zero). */
7f74cf9a 166SCM
fd0a5bbc
HWN
167scm_i_take_stringbufn (char *str, size_t len)
168{
2b829bbb 169 scm_gc_register_collectable_memory (str, len + 1, "stringbuf");
fd0a5bbc
HWN
170
171 return scm_double_cell (STRINGBUF_TAG, (scm_t_bits) str,
172 (scm_t_bits) len, (scm_t_bits) 0);
173}
174
3ee86942
MV
175SCM
176scm_i_stringbuf_mark (SCM buf)
177{
178 return SCM_BOOL_F;
179}
1bbd0b84 180
3ee86942
MV
181void
182scm_i_stringbuf_free (SCM buf)
0f2d19dd 183{
3ee86942 184 if (!STRINGBUF_INLINE (buf))
9c44cd45
MG
185 {
186 if (!STRINGBUF_WIDE (buf))
187 scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf),
188 STRINGBUF_OUTLINE_LENGTH (buf) + 1, "string");
189 else
190 scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf),
191 sizeof (scm_t_wchar) * (STRINGBUF_OUTLINE_LENGTH (buf)
192 + 1), "string");
193 }
194
195}
196
50b1996f
MG
197/* Convert a stringbuf containing 8-bit Latin-1-encoded characters to
198 one containing 32-bit UCS-4-encoded characters. */
9c44cd45
MG
199static void
200widen_stringbuf (SCM buf)
201{
202 size_t i, len;
203 scm_t_wchar *mem;
204
205 if (STRINGBUF_WIDE (buf))
206 return;
207
208 if (STRINGBUF_INLINE (buf))
209 {
210 len = STRINGBUF_INLINE_LENGTH (buf);
211
212 mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
213 for (i = 0; i < len; i++)
214 mem[i] =
215 (scm_t_wchar) (unsigned char) STRINGBUF_INLINE_CHARS (buf)[i];
216 mem[len] = 0;
217
218 SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) ^ STRINGBUF_F_INLINE);
219 SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE);
220 SCM_SET_CELL_WORD_1 (buf, mem);
221 SCM_SET_CELL_WORD_2 (buf, len);
222 }
223 else
224 {
225 len = STRINGBUF_OUTLINE_LENGTH (buf);
226
227 mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
228 for (i = 0; i < len; i++)
229 mem[i] =
230 (scm_t_wchar) (unsigned char) STRINGBUF_OUTLINE_CHARS (buf)[i];
231 mem[len] = 0;
232
233 scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf), len + 1, "string");
234
235 SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE);
236 SCM_SET_CELL_WORD_1 (buf, mem);
237 SCM_SET_CELL_WORD_2 (buf, len);
238 }
3ee86942 239}
bd9e24b3 240
9de87eea 241scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
bd9e24b3 242
3ee86942
MV
243/* Copy-on-write strings.
244 */
bd9e24b3 245
3ee86942 246#define STRING_TAG scm_tc7_string
bd9e24b3 247
3ee86942
MV
248#define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
249#define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
250#define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
bd9e24b3 251
3ee86942
MV
252#define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
253#define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
254
255#define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
256
ed35de72
MV
257/* Read-only strings.
258 */
259
260#define RO_STRING_TAG (scm_tc7_string + 0x200)
261#define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
262
e1b29f6a
MV
263/* Mutation-sharing substrings
264 */
265
266#define SH_STRING_TAG (scm_tc7_string + 0x100)
267
268#define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
269/* START and LENGTH as for STRINGs. */
270
271#define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
272
50b1996f
MG
273/* Create a scheme string with space for LEN 8-bit Latin-1-encoded
274 characters. CHARSP, if not NULL, will be set to location of the
275 char array. */
3ee86942
MV
276SCM
277scm_i_make_string (size_t len, char **charsp)
278{
279 SCM buf = make_stringbuf (len);
280 SCM res;
281 if (charsp)
282 *charsp = STRINGBUF_CHARS (buf);
283 res = scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
284 (scm_t_bits)0, (scm_t_bits) len);
285 return res;
0f2d19dd
JB
286}
287
50b1996f
MG
288/* Create a scheme string with space for LEN 32-bit UCS-4-encoded
289 characters. CHARSP, if not NULL, will be set to location of the
290 character array. */
9c44cd45 291SCM
50b1996f 292scm_i_make_wide_string (size_t len, scm_t_wchar **charsp)
9c44cd45
MG
293{
294 SCM buf = make_wide_stringbuf (len);
295 SCM res;
296 if (charsp)
297 *charsp = STRINGBUF_WIDE_CHARS (buf);
298 res = scm_double_cell (STRING_TAG, SCM_UNPACK (buf),
299 (scm_t_bits) 0, (scm_t_bits) len);
300 return res;
301}
302
3ee86942
MV
303static void
304validate_substring_args (SCM str, size_t start, size_t end)
305{
306 if (!IS_STRING (str))
307 scm_wrong_type_arg_msg (NULL, 0, str, "string");
308 if (start > STRING_LENGTH (str))
309 scm_out_of_range (NULL, scm_from_size_t (start));
310 if (end > STRING_LENGTH (str) || end < start)
311 scm_out_of_range (NULL, scm_from_size_t (end));
312}
0f2d19dd 313
e1b29f6a
MV
314static inline void
315get_str_buf_start (SCM *str, SCM *buf, size_t *start)
316{
317 *start = STRING_START (*str);
318 if (IS_SH_STRING (*str))
319 {
320 *str = SH_STRING_STRING (*str);
321 *start += STRING_START (*str);
322 }
323 *buf = STRING_STRINGBUF (*str);
324}
325
3ee86942
MV
326SCM
327scm_i_substring (SCM str, size_t start, size_t end)
0f2d19dd 328{
e1b29f6a
MV
329 SCM buf;
330 size_t str_start;
331 get_str_buf_start (&str, &buf, &str_start);
9de87eea 332 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 333 SET_STRINGBUF_SHARED (buf);
9de87eea 334 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 335 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
e1b29f6a
MV
336 (scm_t_bits)str_start + start,
337 (scm_t_bits) end - start);
0f2d19dd
JB
338}
339
ed35de72
MV
340SCM
341scm_i_substring_read_only (SCM str, size_t start, size_t end)
342{
45a9f430
LC
343 SCM buf;
344 size_t str_start;
345 get_str_buf_start (&str, &buf, &str_start);
346 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
347 SET_STRINGBUF_SHARED (buf);
348 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
349 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
350 (scm_t_bits)str_start + start,
351 (scm_t_bits) end - start);
ed35de72
MV
352}
353
3ee86942
MV
354SCM
355scm_i_substring_copy (SCM str, size_t start, size_t end)
356{
357 size_t len = end - start;
edea856c 358 SCM buf, my_buf;
e1b29f6a
MV
359 size_t str_start;
360 get_str_buf_start (&str, &buf, &str_start);
9c44cd45
MG
361 if (scm_i_is_narrow_string (str))
362 {
363 my_buf = make_stringbuf (len);
364 memcpy (STRINGBUF_CHARS (my_buf),
365 STRINGBUF_CHARS (buf) + str_start + start, len);
366 }
367 else
368 {
369 my_buf = make_wide_stringbuf (len);
370 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
371 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
372 + start), len);
373 /* Even though this string is wide, the substring may be narrow.
50b1996f 374 Consider adding code to narrow the string. */
9c44cd45 375 }
3ee86942 376 scm_remember_upto_here_1 (buf);
9c44cd45
MG
377 return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
378 (scm_t_bits) 0, (scm_t_bits) len);
3ee86942 379}
0f2d19dd 380
e1b29f6a
MV
381SCM
382scm_i_substring_shared (SCM str, size_t start, size_t end)
383{
384 if (start == 0 && end == STRING_LENGTH (str))
385 return str;
386 else
387 {
388 size_t len = end - start;
389 if (IS_SH_STRING (str))
390 {
391 start += STRING_START (str);
392 str = SH_STRING_STRING (str);
393 }
394 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
395 (scm_t_bits)start, (scm_t_bits) len);
396 }
397}
398
3ee86942
MV
399SCM
400scm_c_substring (SCM str, size_t start, size_t end)
401{
402 validate_substring_args (str, start, end);
403 return scm_i_substring (str, start, end);
404}
ee149d03 405
ed35de72
MV
406SCM
407scm_c_substring_read_only (SCM str, size_t start, size_t end)
408{
409 validate_substring_args (str, start, end);
410 return scm_i_substring_read_only (str, start, end);
411}
412
0f2d19dd 413SCM
3ee86942 414scm_c_substring_copy (SCM str, size_t start, size_t end)
0f2d19dd 415{
3ee86942
MV
416 validate_substring_args (str, start, end);
417 return scm_i_substring_copy (str, start, end);
418}
419
3ee86942
MV
420SCM
421scm_c_substring_shared (SCM str, size_t start, size_t end)
422{
423 validate_substring_args (str, start, end);
424 return scm_i_substring_shared (str, start, end);
425}
0f2d19dd 426
ee149d03 427SCM
3ee86942 428scm_i_string_mark (SCM str)
ee149d03 429{
3ee86942
MV
430 if (IS_SH_STRING (str))
431 return SH_STRING_STRING (str);
432 else
433 return STRING_STRINGBUF (str);
ee149d03
JB
434}
435
3ee86942
MV
436void
437scm_i_string_free (SCM str)
438{
439}
36284627 440
3ee86942
MV
441/* Internal accessors
442 */
443
50b1996f
MG
444/* Returns the number of characters in STR. This may be different
445 than the memory size of the string storage. */
3ee86942
MV
446size_t
447scm_i_string_length (SCM str)
0f2d19dd 448{
3ee86942 449 return STRING_LENGTH (str);
0f2d19dd
JB
450}
451
50b1996f
MG
452/* True if the string is 'narrow', meaning it has a 8-bit Latin-1
453 encoding. False if it is 'wide', having a 32-bit UCS-4
454 encoding. */
9c44cd45
MG
455int
456scm_i_is_narrow_string (SCM str)
457{
458 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
459}
460
50b1996f
MG
461/* Returns a pointer to the 8-bit Latin-1 encoded character array of
462 STR. */
3ee86942
MV
463const char *
464scm_i_string_chars (SCM str)
465{
466 SCM buf;
e1b29f6a
MV
467 size_t start;
468 get_str_buf_start (&str, &buf, &start);
9c44cd45
MG
469 if (scm_i_is_narrow_string (str))
470 return STRINGBUF_CHARS (buf) + start;
471 else
472 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
473 scm_list_1 (str));
474 return NULL;
3ee86942 475}
b00418df 476
50b1996f
MG
477/* Returns a pointer to the 32-bit UCS-4 encoded character array of
478 STR. */
9c44cd45
MG
479const scm_t_wchar *
480scm_i_string_wide_chars (SCM str)
481{
482 SCM buf;
483 size_t start;
484
485 get_str_buf_start (&str, &buf, &start);
486 if (!scm_i_is_narrow_string (str))
487 return STRINGBUF_WIDE_CHARS (buf) + start;
488 else
489 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
490 scm_list_1 (str));
491}
492
493/* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
494 a new string buffer, so that it can be modified without modifying
50b1996f
MG
495 other strings. Also, lock the string mutex. Later, one must call
496 scm_i_string_stop_writing to unlock the mutex. */
9c44cd45
MG
497SCM
498scm_i_string_start_writing (SCM orig_str)
b00418df 499{
ed35de72 500 SCM buf, str = orig_str;
e1b29f6a 501 size_t start;
ed35de72 502
e1b29f6a 503 get_str_buf_start (&str, &buf, &start);
ed35de72
MV
504 if (IS_RO_STRING (str))
505 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
506
9de87eea 507 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942
MV
508 if (STRINGBUF_SHARED (buf))
509 {
9c44cd45 510 /* Clone the stringbuf. */
3ee86942
MV
511 size_t len = STRING_LENGTH (str);
512 SCM new_buf;
513
9de87eea 514 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 515
9c44cd45
MG
516 if (scm_i_is_narrow_string (str))
517 {
518 new_buf = make_stringbuf (len);
519 memcpy (STRINGBUF_CHARS (new_buf),
520 STRINGBUF_CHARS (buf) + STRING_START (str), len);
521
522 }
523 else
524 {
525 new_buf = make_wide_stringbuf (len);
526 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
527 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
528 + STRING_START (str)), len);
529 }
3ee86942
MV
530 scm_i_thread_put_to_sleep ();
531 SET_STRING_STRINGBUF (str, new_buf);
532 start -= STRING_START (str);
533 SET_STRING_START (str, 0);
534 scm_i_thread_wake_up ();
535
536 buf = new_buf;
537
9de87eea 538 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 539 }
9c44cd45
MG
540 return orig_str;
541}
542
50b1996f 543/* Return a pointer to the 8-bit Latin-1 chars of a string. */
9c44cd45
MG
544char *
545scm_i_string_writable_chars (SCM str)
546{
547 SCM buf;
548 size_t start;
3ee86942 549
9c44cd45
MG
550 get_str_buf_start (&str, &buf, &start);
551 if (scm_i_is_narrow_string (str))
552 return STRINGBUF_CHARS (buf) + start;
553 else
554 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
555 scm_list_1 (str));
556 return NULL;
557}
558
50b1996f 559/* Return a pointer to the UCS-4 codepoints of a string. */
9c44cd45
MG
560static scm_t_wchar *
561scm_i_string_writable_wide_chars (SCM str)
562{
563 SCM buf;
564 size_t start;
565
566 get_str_buf_start (&str, &buf, &start);
567 if (!scm_i_is_narrow_string (str))
568 return STRINGBUF_WIDE_CHARS (buf) + start;
569 else
570 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
571 scm_list_1 (str));
b00418df
DH
572}
573
50b1996f
MG
574/* Unlock the string mutex that was locked when
575 scm_i_string_start_writing was called. */
3ee86942
MV
576void
577scm_i_string_stop_writing (void)
578{
9de87eea 579 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 580}
b00418df 581
50b1996f 582/* Return the Xth character of STR as a UCS-4 codepoint. */
9c44cd45
MG
583scm_t_wchar
584scm_i_string_ref (SCM str, size_t x)
585{
586 if (scm_i_is_narrow_string (str))
587 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
588 else
589 return scm_i_string_wide_chars (str)[x];
590}
591
50b1996f 592/* Set the Pth character of STR to UCS-4 codepoint CHR. */
9c44cd45
MG
593void
594scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
595{
596 if (chr > 0xFF && scm_i_is_narrow_string (str))
597 widen_stringbuf (STRING_STRINGBUF (str));
598
599 if (scm_i_is_narrow_string (str))
600 {
601 char *dst = scm_i_string_writable_chars (str);
602 dst[p] = (char) (unsigned char) chr;
603 }
604 else
605 {
606 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
607 dst[p] = chr;
608 }
609}
610
3ee86942
MV
611/* Symbols.
612
613 Basic symbol creation and accessing is done here, the rest is in
614 symbols.[hc]. This has been done to keep stringbufs and the
615 internals of strings and string-like objects confined to this file.
616*/
617
618#define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
619
620SCM
6869328b
MV
621scm_i_make_symbol (SCM name, scm_t_bits flags,
622 unsigned long hash, SCM props)
3ee86942
MV
623{
624 SCM buf;
625 size_t start = STRING_START (name);
626 size_t length = STRING_LENGTH (name);
627
628 if (IS_SH_STRING (name))
629 {
630 name = SH_STRING_STRING (name);
631 start += STRING_START (name);
632 }
633 buf = SYMBOL_STRINGBUF (name);
634
635 if (start == 0 && length == STRINGBUF_LENGTH (buf))
636 {
637 /* reuse buf. */
9de87eea 638 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 639 SET_STRINGBUF_SHARED (buf);
9de87eea 640 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942
MV
641 }
642 else
643 {
644 /* make new buf. */
9c44cd45
MG
645 if (scm_i_is_narrow_string (name))
646 {
647 SCM new_buf = make_stringbuf (length);
648 memcpy (STRINGBUF_CHARS (new_buf),
649 STRINGBUF_CHARS (buf) + start, length);
650 buf = new_buf;
651 }
652 else
653 {
654 SCM new_buf = make_wide_stringbuf (length);
655 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
656 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
657 length);
658 buf = new_buf;
659 }
3ee86942 660 }
6869328b 661 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
3ee86942
MV
662 (scm_t_bits) hash, SCM_UNPACK (props));
663}
664
fd0a5bbc
HWN
665SCM
666scm_i_c_make_symbol (const char *name, size_t len,
667 scm_t_bits flags, unsigned long hash, SCM props)
668{
669 SCM buf = make_stringbuf (len);
670 memcpy (STRINGBUF_CHARS (buf), name, len);
671
672 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
673 (scm_t_bits) hash, SCM_UNPACK (props));
674}
675
676/* Return a new symbol that uses the LEN bytes pointed to by NAME as its
677 underlying storage. */
678SCM
679scm_i_c_take_symbol (char *name, size_t len,
680 scm_t_bits flags, unsigned long hash, SCM props)
681{
682 SCM buf = scm_i_take_stringbufn (name, len);
683
684 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
685 (scm_t_bits) hash, SCM_UNPACK (props));
686}
687
50b1996f
MG
688/* Returns the number of characters in SYM. This may be different
689 from the memory size of SYM. */
3ee86942
MV
690size_t
691scm_i_symbol_length (SCM sym)
0f2d19dd 692{
3ee86942 693 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
0f2d19dd
JB
694}
695
071bb6a8
LC
696size_t
697scm_c_symbol_length (SCM sym)
698#define FUNC_NAME "scm_c_symbol_length"
699{
700 SCM_VALIDATE_SYMBOL (1, sym);
701
702 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
703}
704#undef FUNC_NAME
705
50b1996f
MG
706/* True if the name of SYM is stored as a Latin-1 encoded string.
707 False if it is stored as a 32-bit UCS-4-encoded string. */
9c44cd45
MG
708int
709scm_i_is_narrow_symbol (SCM sym)
710{
711 SCM buf;
712
713 buf = SYMBOL_STRINGBUF (sym);
714 return !STRINGBUF_WIDE (buf);
715}
716
50b1996f
MG
717/* Returns a pointer to the 8-bit Latin-1 encoded character array that
718 contains the name of SYM. */
3ee86942
MV
719const char *
720scm_i_symbol_chars (SCM sym)
721{
9c44cd45
MG
722 SCM buf;
723
724 buf = SYMBOL_STRINGBUF (sym);
725 if (!STRINGBUF_WIDE (buf))
726 return STRINGBUF_CHARS (buf);
727 else
728 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
729 scm_list_1 (sym));
730}
731
50b1996f
MG
732/* Return a pointer to the 32-bit UCS-4-encoded character array of a
733 symbol's name. */
9c44cd45
MG
734const scm_t_wchar *
735scm_i_symbol_wide_chars (SCM sym)
736{
737 SCM buf;
738
739 buf = SYMBOL_STRINGBUF (sym);
740 if (STRINGBUF_WIDE (buf))
741 return STRINGBUF_WIDE_CHARS (buf);
742 else
743 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
744 scm_list_1 (sym));
3ee86942 745}
1cc91f1b 746
3ee86942
MV
747SCM
748scm_i_symbol_mark (SCM sym)
0f2d19dd 749{
3ee86942
MV
750 scm_gc_mark (SYMBOL_STRINGBUF (sym));
751 return SCM_CELL_OBJECT_3 (sym);
0f2d19dd
JB
752}
753
3ee86942
MV
754void
755scm_i_symbol_free (SCM sym)
756{
757}
0f2d19dd 758
be54b15d 759SCM
3ee86942 760scm_i_symbol_substring (SCM sym, size_t start, size_t end)
be54b15d 761{
3ee86942 762 SCM buf = SYMBOL_STRINGBUF (sym);
9de87eea 763 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 764 SET_STRINGBUF_SHARED (buf);
9de87eea 765 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
fd2b17b9 766 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
3ee86942
MV
767 (scm_t_bits)start, (scm_t_bits) end - start);
768}
be54b15d 769
50b1996f 770/* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
9c44cd45
MG
771scm_t_wchar
772scm_i_symbol_ref (SCM sym, size_t x)
773{
774 if (scm_i_is_narrow_symbol (sym))
775 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
776 else
777 return scm_i_symbol_wide_chars (sym)[x];
778}
779
3ee86942
MV
780/* Debugging
781 */
be54b15d 782
6ce6923b
MG
783SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
784 "Returns an association list containing debugging information\n"
785 "for @var{str}. The association list has the following entries."
786 "@table @code\n"
787 "@item string\n"
788 "The string itself.\n"
789 "@item start\n"
790 "The start index of the string into its stringbuf\n"
791 "@item length\n"
792 "The length of the string\n"
793 "@item shared\n"
794 "If this string is a substring, it returns its parent string.\n"
795 "Otherwise, it returns @code{#f}\n"
88ed5759
MG
796 "@item read-only\n"
797 "@code{#t} if the string is read-only\n"
6ce6923b
MG
798 "@item stringbuf-chars\n"
799 "A new string containing this string's stringbuf's characters\n"
800 "@item stringbuf-length\n"
801 "The number of characters in this stringbuf\n"
802 "@item stringbuf-shared\n"
803 "@code{#t} if this stringbuf is shared\n"
804 "@item stringbuf-inline\n"
805 "@code{#t} if this stringbuf's characters are stored in the\n"
806 "cell itself, or @code{#f} if they were allocated in memory\n"
807 "@item stringbuf-wide\n"
808 "@code{#t} if this stringbuf's characters are stored in a\n"
809 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
810 "buffer\n"
811 "@end table")
3ee86942
MV
812#define FUNC_NAME s_scm_sys_string_dump
813{
6ce6923b
MG
814 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9, e10;
815 SCM buf;
3ee86942 816 SCM_VALIDATE_STRING (1, str);
6ce6923b
MG
817
818 /* String info */
819 e1 = scm_cons (scm_from_locale_symbol ("string"),
820 str);
821 e2 = scm_cons (scm_from_locale_symbol ("start"),
822 scm_from_size_t (STRING_START (str)));
823 e3 = scm_cons (scm_from_locale_symbol ("length"),
824 scm_from_size_t (STRING_LENGTH (str)));
825
3ee86942
MV
826 if (IS_SH_STRING (str))
827 {
6ce6923b
MG
828 e4 = scm_cons (scm_from_locale_symbol ("shared"),
829 SH_STRING_STRING (str));
830 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
3ee86942
MV
831 }
832 else
833 {
6ce6923b
MG
834 e4 = scm_cons (scm_from_locale_symbol ("shared"),
835 SCM_BOOL_F);
836 buf = STRING_STRINGBUF (str);
837 }
9c44cd45 838
88ed5759
MG
839 if (IS_RO_STRING (str))
840 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
841 SCM_BOOL_T);
842 else
843 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
844 SCM_BOOL_F);
845
6ce6923b 846 /* Stringbuf info */
6ce6923b
MG
847 if (!STRINGBUF_WIDE (buf))
848 {
849 size_t len = STRINGBUF_LENGTH (buf);
850 char *cbuf;
851 SCM sbc = scm_i_make_string (len, &cbuf);
852 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
853 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
854 sbc);
3ee86942 855 }
6ce6923b
MG
856 else
857 {
858 size_t len = STRINGBUF_LENGTH (buf);
859 scm_t_wchar *cbuf;
860 SCM sbc = scm_i_make_wide_string (len, &cbuf);
861 u32_cpy ((scm_t_uint32 *) cbuf,
862 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
863 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
864 sbc);
865 }
866 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
867 scm_from_size_t (STRINGBUF_LENGTH (buf)));
868 if (STRINGBUF_SHARED (buf))
869 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
870 SCM_BOOL_T);
871 else
872 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
873 SCM_BOOL_F);
874 if (STRINGBUF_INLINE (buf))
875 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
876 SCM_BOOL_T);
877 else
878 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
879 SCM_BOOL_F);
880 if (STRINGBUF_WIDE (buf))
881 e10 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
882 SCM_BOOL_T);
883 else
884 e10 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
885 SCM_BOOL_F);
886
887 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, SCM_UNDEFINED);
3ee86942
MV
888}
889#undef FUNC_NAME
890
6ce6923b
MG
891SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
892 "Returns an association list containing debugging information\n"
893 "for @var{sym}. The association list has the following entries."
894 "@table @code\n"
895 "@item symbol\n"
896 "The symbol itself\n"
897 "@item hash\n"
898 "Its hash value\n"
88ed5759
MG
899 "@item interned\n"
900 "@code{#t} if it is an interned symbol\n"
6ce6923b
MG
901 "@item stringbuf-chars\n"
902 "A new string containing this symbols's stringbuf's characters\n"
903 "@item stringbuf-length\n"
904 "The number of characters in this stringbuf\n"
905 "@item stringbuf-shared\n"
906 "@code{#t} if this stringbuf is shared\n"
907 "@item stringbuf-inline\n"
908 "@code{#t} if this stringbuf's characters are stored in the\n"
909 "cell itself, or @code{#f} if they were allocated in memory\n"
910 "@item stringbuf-wide\n"
911 "@code{#t} if this stringbuf's characters are stored in a\n"
912 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
913 "buffer\n"
914 "@end table")
3ee86942
MV
915#define FUNC_NAME s_scm_sys_symbol_dump
916{
6ce6923b
MG
917 SCM e1, e2, e3, e4, e5, e6, e7, e8;
918 SCM buf;
3ee86942 919 SCM_VALIDATE_SYMBOL (1, sym);
6ce6923b
MG
920 e1 = scm_cons (scm_from_locale_symbol ("symbol"),
921 sym);
922 e2 = scm_cons (scm_from_locale_symbol ("hash"),
923 scm_from_ulong (scm_i_symbol_hash (sym)));
88ed5759
MG
924 e3 = scm_cons (scm_from_locale_symbol ("interned"),
925 scm_symbol_interned_p (sym));
6ce6923b
MG
926 buf = SYMBOL_STRINGBUF (sym);
927
928 /* Stringbuf info */
6ce6923b
MG
929 if (!STRINGBUF_WIDE (buf))
930 {
931 size_t len = STRINGBUF_LENGTH (buf);
932 char *cbuf;
933 SCM sbc = scm_i_make_string (len, &cbuf);
934 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
935 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
936 sbc);
937 }
9c44cd45 938 else
6ce6923b
MG
939 {
940 size_t len = STRINGBUF_LENGTH (buf);
941 scm_t_wchar *cbuf;
942 SCM sbc = scm_i_make_wide_string (len, &cbuf);
943 u32_cpy ((scm_t_uint32 *) cbuf,
944 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
945 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
946 sbc);
947 }
948 e5 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
949 scm_from_size_t (STRINGBUF_LENGTH (buf)));
950 if (STRINGBUF_SHARED (buf))
951 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
952 SCM_BOOL_T);
953 else
954 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
955 SCM_BOOL_F);
956 if (STRINGBUF_INLINE (buf))
957 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
958 SCM_BOOL_T);
959 else
960 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
961 SCM_BOOL_F);
962 if (STRINGBUF_WIDE (buf))
963 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
964 SCM_BOOL_T);
965 else
966 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
967 SCM_BOOL_F);
968 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, SCM_UNDEFINED);
969
3ee86942
MV
970}
971#undef FUNC_NAME
972
6ce6923b
MG
973#if SCM_STRING_LENGTH_HISTOGRAM
974
9c44cd45 975SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
e1b29f6a 976#define FUNC_NAME s_scm_sys_stringbuf_hist
3ee86942
MV
977{
978 int i;
979 for (i = 0; i < 1000; i++)
980 if (lenhist[i])
981 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
982 fprintf (stderr, ">999: %u\n", lenhist[1000]);
983 return SCM_UNSPECIFIED;
be54b15d
DH
984}
985#undef FUNC_NAME
986
3ee86942
MV
987#endif
988
989\f
990
991SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
992 (SCM obj),
993 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
994#define FUNC_NAME s_scm_string_p
995{
996 return scm_from_bool (IS_STRING (obj));
997}
998#undef FUNC_NAME
999
1000
1001SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1002
1003SCM_DEFINE (scm_string, "string", 0, 0, 1,
1004 (SCM chrs),
1005 "@deffnx {Scheme Procedure} list->string chrs\n"
1006 "Return a newly allocated string composed of the arguments,\n"
1007 "@var{chrs}.")
1008#define FUNC_NAME s_scm_string
1009{
1010 SCM result;
9c44cd45 1011 SCM rest;
3ee86942 1012 size_t len;
9c44cd45
MG
1013 size_t p = 0;
1014 long i;
3ee86942 1015
9c44cd45
MG
1016 /* Verify that this is a list of chars. */
1017 i = scm_ilength (chrs);
1018 len = (size_t) i;
1019 rest = chrs;
3ee86942 1020
9c44cd45
MG
1021 SCM_ASSERT (len >= 0, chrs, SCM_ARG1, FUNC_NAME);
1022 while (len > 0 && scm_is_pair (rest))
3ee86942 1023 {
9c44cd45 1024 SCM elt = SCM_CAR (rest);
3ee86942 1025 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
9c44cd45
MG
1026 rest = SCM_CDR (rest);
1027 len--;
1028 scm_remember_upto_here_1 (elt);
1029 }
1030
1031 /* Construct a string containing this list of chars. */
1032 len = (size_t) i;
1033 rest = chrs;
1034
1035 result = scm_i_make_string (len, NULL);
1036 result = scm_i_string_start_writing (result);
1037 while (len > 0 && scm_is_pair (rest))
1038 {
1039 SCM elt = SCM_CAR (rest);
1040 scm_i_string_set_x (result, p, SCM_CHAR (elt));
1041 p++;
1042 rest = SCM_CDR (rest);
3ee86942 1043 len--;
9c44cd45 1044 scm_remember_upto_here_1 (elt);
3ee86942 1045 }
9c44cd45
MG
1046 scm_i_string_stop_writing ();
1047
3ee86942
MV
1048 if (len > 0)
1049 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
9c44cd45 1050 if (!scm_is_null (rest))
3ee86942
MV
1051 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1052
1053 return result;
1054}
1055#undef FUNC_NAME
be54b15d 1056
3b3b36dd 1057SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
6fa73e72 1058 (SCM k, SCM chr),
0d26a824
MG
1059 "Return a newly allocated string of\n"
1060 "length @var{k}. If @var{chr} is given, then all elements of\n"
1061 "the string are initialized to @var{chr}, otherwise the contents\n"
9401323e 1062 "of the @var{string} are unspecified.")
1bbd0b84 1063#define FUNC_NAME s_scm_make_string
0f2d19dd 1064{
3ee86942
MV
1065 return scm_c_make_string (scm_to_size_t (k), chr);
1066}
1067#undef FUNC_NAME
1068
1069SCM
1070scm_c_make_string (size_t len, SCM chr)
1071#define FUNC_NAME NULL
1072{
9c44cd45
MG
1073 size_t p;
1074 SCM res = scm_i_make_string (len, NULL);
cb0d8be2 1075
e11e83f3
MV
1076 if (!SCM_UNBNDP (chr))
1077 {
3ee86942 1078 SCM_VALIDATE_CHAR (0, chr);
9c44cd45
MG
1079 res = scm_i_string_start_writing (res);
1080 for (p = 0; p < len; p++)
1081 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1082 scm_i_string_stop_writing ();
0f2d19dd 1083 }
e11e83f3
MV
1084
1085 return res;
0f2d19dd 1086}
1bbd0b84 1087#undef FUNC_NAME
0f2d19dd 1088
3b3b36dd 1089SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
0d26a824
MG
1090 (SCM string),
1091 "Return the number of characters in @var{string}.")
1bbd0b84 1092#define FUNC_NAME s_scm_string_length
0f2d19dd 1093{
d1ca2c64 1094 SCM_VALIDATE_STRING (1, string);
3ee86942 1095 return scm_from_size_t (STRING_LENGTH (string));
0f2d19dd 1096}
1bbd0b84 1097#undef FUNC_NAME
0f2d19dd 1098
9c44cd45
MG
1099SCM_DEFINE (scm_string_width, "string-width", 1, 0, 0,
1100 (SCM string),
1101 "Return the bytes used to represent a character in @var{string}."
1102 "This will return 1 or 4.")
1103#define FUNC_NAME s_scm_string_width
1104{
1105 SCM_VALIDATE_STRING (1, string);
1106 if (!scm_i_is_narrow_string (string))
1107 return scm_from_int (4);
1108
1109 return scm_from_int (1);
1110}
1111#undef FUNC_NAME
1112
3ee86942
MV
1113size_t
1114scm_c_string_length (SCM string)
1115{
1116 if (!IS_STRING (string))
1117 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1118 return STRING_LENGTH (string);
1119}
1120
bd9e24b3 1121SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
6fa73e72 1122 (SCM str, SCM k),
9c44cd45
MG
1123 "Return character @var{k} of @var{str} using zero-origin\n"
1124 "indexing. @var{k} must be a valid index of @var{str}.")
1bbd0b84 1125#define FUNC_NAME s_scm_string_ref
0f2d19dd 1126{
3ae3166b 1127 size_t len;
a55c2b68 1128 unsigned long idx;
bd9e24b3 1129
d1ca2c64 1130 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1131
1132 len = scm_i_string_length (str);
1133 if (SCM_LIKELY (len > 0))
1134 idx = scm_to_unsigned_integer (k, 0, len - 1);
1135 else
1136 scm_out_of_range (NULL, k);
1137
9c44cd45
MG
1138 if (scm_i_is_narrow_string (str))
1139 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1140 else
1141 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
0f2d19dd 1142}
1bbd0b84 1143#undef FUNC_NAME
0f2d19dd 1144
3ee86942
MV
1145SCM
1146scm_c_string_ref (SCM str, size_t p)
1147{
1148 if (p >= scm_i_string_length (str))
1149 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1150 if (scm_i_is_narrow_string (str))
1151 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1152 else
1153 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1154
3ee86942 1155}
f0942910 1156
3b3b36dd 1157SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
6fa73e72 1158 (SCM str, SCM k, SCM chr),
9c44cd45
MG
1159 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1160 "an unspecified value. @var{k} must be a valid index of\n"
1161 "@var{str}.")
1bbd0b84 1162#define FUNC_NAME s_scm_string_set_x
0f2d19dd 1163{
3ae3166b 1164 size_t len;
a55c2b68
MV
1165 unsigned long idx;
1166
f0942910 1167 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1168
1169 len = scm_i_string_length (str);
1170 if (SCM_LIKELY (len > 0))
1171 idx = scm_to_unsigned_integer (k, 0, len - 1);
1172 else
1173 scm_out_of_range (NULL, k);
1174
34d19ef6 1175 SCM_VALIDATE_CHAR (3, chr);
9c44cd45
MG
1176 str = scm_i_string_start_writing (str);
1177 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1178 scm_i_string_stop_writing ();
1179
0f2d19dd
JB
1180 return SCM_UNSPECIFIED;
1181}
1bbd0b84 1182#undef FUNC_NAME
0f2d19dd 1183
3ee86942
MV
1184void
1185scm_c_string_set_x (SCM str, size_t p, SCM chr)
1186{
1187 if (p >= scm_i_string_length (str))
1188 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1189 str = scm_i_string_start_writing (str);
1190 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1191 scm_i_string_stop_writing ();
3ee86942 1192}
0f2d19dd 1193
3b3b36dd 1194SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
0d26a824
MG
1195 (SCM str, SCM start, SCM end),
1196 "Return a newly allocated string formed from the characters\n"
1197 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1198 "ending with index @var{end} (exclusive).\n"
1199 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1200 "exact integers satisfying:\n\n"
1201 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1bbd0b84 1202#define FUNC_NAME s_scm_substring
0f2d19dd 1203{
3ee86942 1204 size_t len, from, to;
685c0d71 1205
d1ca2c64 1206 SCM_VALIDATE_STRING (1, str);
3ee86942
MV
1207 len = scm_i_string_length (str);
1208 from = scm_to_unsigned_integer (start, 0, len);
a55c2b68 1209 if (SCM_UNBNDP (end))
3ee86942 1210 to = len;
a55c2b68 1211 else
3ee86942
MV
1212 to = scm_to_unsigned_integer (end, from, len);
1213 return scm_i_substring (str, from, to);
0f2d19dd 1214}
1bbd0b84 1215#undef FUNC_NAME
0f2d19dd 1216
ed35de72
MV
1217SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1218 (SCM str, SCM start, SCM end),
1219 "Return a newly allocated string formed from the characters\n"
1220 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1221 "ending with index @var{end} (exclusive).\n"
1222 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1223 "exact integers satisfying:\n"
1224 "\n"
1225 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1226 "\n"
1227 "The returned string is read-only.\n")
1228#define FUNC_NAME s_scm_substring_read_only
1229{
1230 size_t len, from, to;
1231
1232 SCM_VALIDATE_STRING (1, str);
1233 len = scm_i_string_length (str);
1234 from = scm_to_unsigned_integer (start, 0, len);
1235 if (SCM_UNBNDP (end))
1236 to = len;
1237 else
1238 to = scm_to_unsigned_integer (end, from, len);
1239 return scm_i_substring_read_only (str, from, to);
1240}
1241#undef FUNC_NAME
1242
3ee86942
MV
1243SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1244 (SCM str, SCM start, SCM end),
1245 "Return a newly allocated string formed from the characters\n"
1246 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1247 "ending with index @var{end} (exclusive).\n"
1248 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1249 "exact integers satisfying:\n\n"
1250 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1251#define FUNC_NAME s_scm_substring_copy
1252{
e1b29f6a
MV
1253 /* For the Scheme version, START is mandatory, but for the C
1254 version, it is optional. See scm_string_copy in srfi-13.c for a
1255 rationale.
1256 */
1257
1258 size_t from, to;
3ee86942
MV
1259
1260 SCM_VALIDATE_STRING (1, str);
e1b29f6a
MV
1261 scm_i_get_substring_spec (scm_i_string_length (str),
1262 start, &from, end, &to);
3ee86942
MV
1263 return scm_i_substring_copy (str, from, to);
1264}
1265#undef FUNC_NAME
1266
1267SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1268 (SCM str, SCM start, SCM end),
1269 "Return string that indirectly refers to the characters\n"
1270 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1271 "ending with index @var{end} (exclusive).\n"
1272 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1273 "exact integers satisfying:\n\n"
1274 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1275#define FUNC_NAME s_scm_substring_shared
1276{
1277 size_t len, from, to;
1278
1279 SCM_VALIDATE_STRING (1, str);
1280 len = scm_i_string_length (str);
1281 from = scm_to_unsigned_integer (start, 0, len);
1282 if (SCM_UNBNDP (end))
1283 to = len;
1284 else
1285 to = scm_to_unsigned_integer (end, from, len);
1286 return scm_i_substring_shared (str, from, to);
1287}
1288#undef FUNC_NAME
685c0d71 1289
3b3b36dd 1290SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
6fa73e72 1291 (SCM args),
9c44cd45 1292 "Return a newly allocated string whose characters form the\n"
0d26a824 1293 "concatenation of the given strings, @var{args}.")
1bbd0b84 1294#define FUNC_NAME s_scm_string_append
0f2d19dd
JB
1295{
1296 SCM res;
9c44cd45
MG
1297 size_t len = 0;
1298 int wide = 0;
c829a427 1299 SCM l, s;
bd4911ef 1300 size_t i;
9909c395
MG
1301 union
1302 {
1303 char *narrow;
1304 scm_t_wchar *wide;
1305 } data;
af45e3b0
DH
1306
1307 SCM_VALIDATE_REST_ARGUMENT (args);
9c44cd45 1308 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427
MV
1309 {
1310 s = SCM_CAR (l);
1311 SCM_VALIDATE_STRING (SCM_ARGn, s);
9c44cd45
MG
1312 len += scm_i_string_length (s);
1313 if (!scm_i_is_narrow_string (s))
1314 wide = 1;
c829a427 1315 }
9909c395 1316 data.narrow = NULL;
9c44cd45 1317 if (!wide)
9909c395 1318 res = scm_i_make_string (len, &data.narrow);
9c44cd45 1319 else
9909c395 1320 res = scm_i_make_wide_string (len, &data.wide);
9c44cd45
MG
1321
1322 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427 1323 {
edea856c 1324 size_t len;
c829a427 1325 s = SCM_CAR (l);
3ee86942 1326 SCM_VALIDATE_STRING (SCM_ARGn, s);
edea856c 1327 len = scm_i_string_length (s);
9c44cd45
MG
1328 if (!wide)
1329 {
9909c395
MG
1330 memcpy (data.narrow, scm_i_string_chars (s), len);
1331 data.narrow += len;
9c44cd45
MG
1332 }
1333 else
1334 {
1335 if (scm_i_is_narrow_string (s))
1336 {
1337 for (i = 0; i < scm_i_string_length (s); i++)
9909c395 1338 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
9c44cd45
MG
1339 }
1340 else
9909c395 1341 u32_cpy ((scm_t_uint32 *) data.wide,
9c44cd45 1342 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
9909c395 1343 data.wide += len;
9c44cd45 1344 }
c829a427
MV
1345 scm_remember_upto_here_1 (s);
1346 }
0f2d19dd
JB
1347 return res;
1348}
1bbd0b84 1349#undef FUNC_NAME
0f2d19dd 1350
c829a427
MV
1351int
1352scm_is_string (SCM obj)
1353{
3ee86942 1354 return IS_STRING (obj);
c829a427 1355}
24933780 1356
c829a427
MV
1357SCM
1358scm_from_locale_stringn (const char *str, size_t len)
1359{
1360 SCM res;
1361 char *dst;
4d4528e7 1362
9c44cd45 1363 if (len == (size_t) -1)
c829a427 1364 len = strlen (str);
9c44cd45
MG
1365 if (len == 0)
1366 return scm_nullstr;
1367
3ee86942 1368 res = scm_i_make_string (len, &dst);
c829a427
MV
1369 memcpy (dst, str, len);
1370 return res;
1371}
4d4528e7 1372
c829a427
MV
1373SCM
1374scm_from_locale_string (const char *str)
4d4528e7 1375{
9c44cd45
MG
1376 if (str == NULL)
1377 return scm_nullstr;
1378
c829a427
MV
1379 return scm_from_locale_stringn (str, -1);
1380}
4d4528e7 1381
50b1996f
MG
1382/* Create a new scheme string from the C string STR. The memory of
1383 STR may be used directly as storage for the new string. */
c829a427
MV
1384SCM
1385scm_take_locale_stringn (char *str, size_t len)
1386{
48ddf0d9
KR
1387 SCM buf, res;
1388
9c44cd45 1389 if (len == (size_t) -1)
48ddf0d9 1390 len = strlen (str);
c829a427
MV
1391 else
1392 {
48ddf0d9
KR
1393 /* Ensure STR is null terminated. A realloc for 1 extra byte should
1394 often be satisfied from the alignment padding after the block, with
1395 no actual data movement. */
9c44cd45 1396 str = scm_realloc (str, len + 1);
48ddf0d9 1397 str[len] = '\0';
c829a427 1398 }
c829a427 1399
fd0a5bbc 1400 buf = scm_i_take_stringbufn (str, len);
3ee86942 1401 res = scm_double_cell (STRING_TAG,
9c44cd45 1402 SCM_UNPACK (buf), (scm_t_bits) 0, (scm_t_bits) len);
c829a427
MV
1403 return res;
1404}
1405
48ddf0d9
KR
1406SCM
1407scm_take_locale_string (char *str)
1408{
1409 return scm_take_locale_stringn (str, -1);
1410}
1411
9c44cd45
MG
1412/* Change libunistring escapes (\uXXXX and \UXXXXXXXX) to \xXX \uXXXX
1413 and \UXXXXXX. */
1414static void
1415unistring_escapes_to_guile_escapes (char **bufp, size_t *lenp)
1416{
1417 char *before, *after;
1418 size_t i, j;
1419
1420 before = *bufp;
1421 after = *bufp;
1422 i = 0;
1423 j = 0;
1424 while (i < *lenp)
1425 {
1426 if ((i <= *lenp - 6)
1427 && before[i] == '\\'
1428 && before[i + 1] == 'u'
1429 && before[i + 2] == '0' && before[i + 3] == '0')
1430 {
1431 /* Convert \u00NN to \xNN */
1432 after[j] = '\\';
1433 after[j + 1] = 'x';
30a6b9ca
MG
1434 after[j + 2] = tolower ((int) before[i + 4]);
1435 after[j + 3] = tolower ((int) before[i + 5]);
9c44cd45
MG
1436 i += 6;
1437 j += 4;
1438 }
1439 else if ((i <= *lenp - 10)
1440 && before[i] == '\\'
1441 && before[i + 1] == 'U'
1442 && before[i + 2] == '0' && before[i + 3] == '0')
1443 {
1444 /* Convert \U00NNNNNN to \UNNNNNN */
1445 after[j] = '\\';
1446 after[j + 1] = 'U';
30a6b9ca
MG
1447 after[j + 2] = tolower ((int) before[i + 4]);
1448 after[j + 3] = tolower ((int) before[i + 5]);
1449 after[j + 4] = tolower ((int) before[i + 6]);
1450 after[j + 5] = tolower ((int) before[i + 7]);
1451 after[j + 6] = tolower ((int) before[i + 8]);
1452 after[j + 7] = tolower ((int) before[i + 9]);
9c44cd45
MG
1453 i += 10;
1454 j += 8;
1455 }
1456 else
1457 {
1458 after[j] = before[i];
1459 i++;
1460 j++;
1461 }
1462 }
1463 *lenp = j;
1464 after = scm_realloc (after, j);
1465}
1466
c829a427 1467char *
9c44cd45 1468scm_to_locale_stringn (SCM str, size_t * lenp)
c829a427 1469{
9c44cd45
MG
1470 const char *enc;
1471
1472 /* In the future, enc will hold the port's encoding. */
1473 enc = NULL;
1474
1475 return scm_to_stringn (str, lenp, enc, iconveh_escape_sequence);
1476}
1477
1478/* Low-level scheme to C string conversion function. */
1479char *
1480scm_to_stringn (SCM str, size_t * lenp, const char *encoding,
1481 enum iconv_ilseq_handler handler)
1482{
1483 static const char iso[11] = "ISO-8859-1";
1484 char *buf;
1485 size_t ilen, len, i;
4d4528e7 1486
3ee86942 1487 if (!scm_is_string (str))
c829a427 1488 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1489 ilen = scm_i_string_length (str);
1490
1491 if (ilen == 0)
1492 {
1493 buf = scm_malloc (1);
1494 buf[0] = '\0';
1495 if (lenp)
1496 *lenp = 0;
1497 return buf;
1498 }
1499
c829a427 1500 if (lenp == NULL)
9c44cd45
MG
1501 for (i = 0; i < ilen; i++)
1502 if (scm_i_string_ref (str, i) == '\0')
1503 scm_misc_error (NULL,
1504 "string contains #\\nul character: ~S",
1505 scm_list_1 (str));
1506
1507 if (scm_i_is_narrow_string (str))
c829a427 1508 {
9c44cd45
MG
1509 if (lenp)
1510 {
1511 buf = scm_malloc (ilen);
1512 memcpy (buf, scm_i_string_chars (str), ilen);
1513 *lenp = ilen;
1514 return buf;
1515 }
1516 else
1517 {
1518 buf = scm_malloc (ilen + 1);
1519 memcpy (buf, scm_i_string_chars (str), ilen);
1520 buf[ilen] = '\0';
1521 return buf;
1522 }
c829a427 1523 }
9c44cd45
MG
1524
1525
1526 buf = NULL;
1527 len = 0;
1528 buf = u32_conv_to_encoding (iso,
1529 handler,
1530 (scm_t_uint32 *) scm_i_string_wide_chars (str),
1531 ilen, NULL, NULL, &len);
1532 if (buf == NULL)
1533 scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
1534 scm_list_2 (scm_from_locale_string (iso), str));
1535
1536 if (handler == iconveh_escape_sequence)
1537 unistring_escapes_to_guile_escapes (&buf, &len);
1538
1539 if (lenp)
4d4528e7 1540 *lenp = len;
9c44cd45
MG
1541 else
1542 {
1543 buf = scm_realloc (buf, len + 1);
1544 buf[len] = '\0';
1545 }
24933780 1546
c829a427 1547 scm_remember_upto_here_1 (str);
9c44cd45 1548 return buf;
4d4528e7 1549}
af68e5e5 1550
c829a427
MV
1551char *
1552scm_to_locale_string (SCM str)
1553{
1554 return scm_to_locale_stringn (str, NULL);
1555}
af68e5e5 1556
c829a427
MV
1557size_t
1558scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
1559{
1560 size_t len;
9c44cd45 1561 char *result = NULL;
3ee86942 1562 if (!scm_is_string (str))
c829a427 1563 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1564 result = scm_to_locale_stringn (str, &len);
1565
1566 memcpy (buf, result, (len > max_len) ? max_len : len);
1567 free (result);
1568
c829a427
MV
1569 scm_remember_upto_here_1 (str);
1570 return len;
1571}
af68e5e5 1572
3ee86942
MV
1573/* converts C scm_array of strings to SCM scm_list of strings. */
1574/* If argc < 0, a null terminated scm_array is assumed. */
9c44cd45 1575SCM
3ee86942
MV
1576scm_makfromstrs (int argc, char **argv)
1577{
1578 int i = argc;
1579 SCM lst = SCM_EOL;
1580 if (0 > i)
1581 for (i = 0; argv[i]; i++);
1582 while (i--)
1583 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
1584 return lst;
1585}
1586
c829a427
MV
1587/* Return a newly allocated array of char pointers to each of the strings
1588 in args, with a terminating NULL pointer. */
1589
1590char **
1591scm_i_allocate_string_pointers (SCM list)
af68e5e5 1592{
c829a427
MV
1593 char **result;
1594 int len = scm_ilength (list);
1595 int i;
1596
1597 if (len < 0)
1598 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
1599
661ae7ab 1600 scm_dynwind_begin (0);
c829a427
MV
1601
1602 result = (char **) scm_malloc ((len + 1) * sizeof (char *));
1603 result[len] = NULL;
661ae7ab 1604 scm_dynwind_unwind_handler (free, result, 0);
c829a427
MV
1605
1606 /* The list might be have been modified in another thread, so
1607 we check LIST before each access.
1608 */
d2e53ed6 1609 for (i = 0; i < len && scm_is_pair (list); i++)
c829a427
MV
1610 {
1611 result[i] = scm_to_locale_string (SCM_CAR (list));
1612 list = SCM_CDR (list);
1613 }
1614
661ae7ab 1615 scm_dynwind_end ();
c829a427 1616 return result;
af68e5e5 1617}
e53cc817 1618
c829a427
MV
1619void
1620scm_i_free_string_pointers (char **pointers)
1621{
1622 int i;
1623
1624 for (i = 0; pointers[i]; i++)
1625 free (pointers[i]);
1626 free (pointers);
1627}
24933780 1628
6f14f578
MV
1629void
1630scm_i_get_substring_spec (size_t len,
1631 SCM start, size_t *cstart,
1632 SCM end, size_t *cend)
1633{
1634 if (SCM_UNBNDP (start))
1635 *cstart = 0;
1636 else
1637 *cstart = scm_to_unsigned_integer (start, 0, len);
1638
1639 if (SCM_UNBNDP (end))
1640 *cend = len;
1641 else
1642 *cend = scm_to_unsigned_integer (end, *cstart, len);
1643}
1644
3ee86942
MV
1645#if SCM_ENABLE_DEPRECATED
1646
556d75db
MV
1647/* When these definitions are removed, it becomes reasonable to use
1648 read-only strings for string literals. For that, change the reader
1649 to create string literals with scm_c_substring_read_only instead of
1650 with scm_c_substring_copy.
1651*/
1652
3ee86942 1653int
fe78c51a 1654scm_i_deprecated_stringp (SCM str)
3ee86942
MV
1655{
1656 scm_c_issue_deprecation_warning
1657 ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
1658
2616f0e0 1659 return scm_is_string (str);
3ee86942
MV
1660}
1661
1662char *
fe78c51a 1663scm_i_deprecated_string_chars (SCM str)
3ee86942
MV
1664{
1665 char *chars;
1666
1667 scm_c_issue_deprecation_warning
1668 ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
1669
2616f0e0
MV
1670 /* We don't accept shared substrings here since they are not
1671 null-terminated.
1672 */
1673 if (IS_SH_STRING (str))
1674 scm_misc_error (NULL,
1675 "SCM_STRING_CHARS does not work with shared substrings.",
1676 SCM_EOL);
1677
877f06c3 1678 /* We explicitly test for read-only strings to produce a better
556d75db
MV
1679 error message.
1680 */
1681
1682 if (IS_RO_STRING (str))
1683 scm_misc_error (NULL,
1684 "SCM_STRING_CHARS does not work with read-only strings.",
1685 SCM_EOL);
1686
2616f0e0 1687 /* The following is still wrong, of course...
3ee86942 1688 */
9c44cd45 1689 str = scm_i_string_start_writing (str);
3ee86942
MV
1690 chars = scm_i_string_writable_chars (str);
1691 scm_i_string_stop_writing ();
1692 return chars;
1693}
1694
1695size_t
fe78c51a 1696scm_i_deprecated_string_length (SCM str)
3ee86942
MV
1697{
1698 scm_c_issue_deprecation_warning
1699 ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
1700 return scm_c_string_length (str);
1701}
1702
1703#endif
1704
0f2d19dd
JB
1705void
1706scm_init_strings ()
0f2d19dd 1707{
3ee86942 1708 scm_nullstr = scm_i_make_string (0, NULL);
7c33806a 1709
a0599745 1710#include "libguile/strings.x"
0f2d19dd
JB
1711}
1712
89e00824
ML
1713
1714/*
1715 Local Variables:
1716 c-file-style: "gnu"
1717 End:
1718*/