Add full Unicode capability to ports and the default reader
[bpt/guile.git] / libguile / strings.c
CommitLineData
50b1996f 1/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009 Free Software Foundation, Inc.
0f2d19dd 2 *
73be1d9e 3 * This library is free software; you can redistribute it and/or
53befeb7
NJ
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
0f2d19dd 7 *
53befeb7
NJ
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
0f2d19dd 12 *
73be1d9e
MV
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
53befeb7
NJ
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
73be1d9e 17 */
1bbd0b84 18
1bbd0b84 19
0f2d19dd 20\f
dbb605f5
LC
21#ifdef HAVE_CONFIG_H
22# include <config.h>
23#endif
0f2d19dd 24
faf2c9d7 25#include <string.h>
3ee86942 26#include <stdio.h>
9c44cd45
MG
27#include <ctype.h>
28#include <unistr.h>
eca29b02 29#include <uniconv.h>
faf2c9d7 30
889975e5
MG
31#include "striconveh.h"
32
a0599745
MD
33#include "libguile/_scm.h"
34#include "libguile/chars.h"
7c33806a 35#include "libguile/root.h"
a0599745 36#include "libguile/strings.h"
1afff620 37#include "libguile/deprecation.h"
a0599745 38#include "libguile/validate.h"
c829a427 39#include "libguile/dynwind.h"
1afff620 40
0f2d19dd
JB
41\f
42
43/* {Strings}
44 */
45
3ee86942
MV
46
47/* Stringbufs
48 *
49 * XXX - keeping an accurate refcount during GC seems to be quite
50 * tricky, so we just keep score of whether a stringbuf might be
50b1996f 51 * shared, not whether it definitely is.
3ee86942
MV
52 *
53 * The scheme I (mvo) tried to keep an accurate reference count would
54 * recount all strings that point to a stringbuf during the mark-phase
55 * of the GC. This was done since one cannot access the stringbuf of
56 * a string when that string is freed (in order to decrease the
57 * reference count). The memory of the stringbuf might have been
58 * reused already for something completely different.
59 *
60 * This recounted worked for a small number of threads beating on
61 * cow-strings, but it failed randomly with more than 10 threads, say.
62 * I couldn't figure out what went wrong, so I used the conservative
63 * approach implemented below.
64 *
65 * A stringbuf needs to know its length, but only so that it can be
66 * reported when the stringbuf is freed.
67 *
50b1996f
MG
68 * There are 3 storage strategies for stringbufs: inline, outline, and
69 * wide.
70 *
71 * Inline strings are small 8-bit strings stored within the double
72 * cell itself. Outline strings are larger 8-bit strings with GC
73 * allocated storage. Wide strings are 32-bit strings with allocated
74 * storage.
75 *
76 * There was little value in making wide string inlineable, since
77 * there is only room for three inlined 32-bit characters. Thus wide
78 * stringbufs are never inlined.
3ee86942
MV
79 */
80
81#define STRINGBUF_F_SHARED 0x100
82#define STRINGBUF_F_INLINE 0x200
50b1996f
MG
83#define STRINGBUF_F_WIDE 0x400 /* If true, strings have UCS-4
84 encoding. Otherwise, strings
85 are Latin-1. */
3ee86942
MV
86
87#define STRINGBUF_TAG scm_tc7_stringbuf
88#define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
89#define STRINGBUF_INLINE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_INLINE)
9c44cd45 90#define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
3ee86942 91
f59cf998 92#define STRINGBUF_OUTLINE_CHARS(buf) ((unsigned char *) SCM_CELL_WORD_1(buf))
3ee86942 93#define STRINGBUF_OUTLINE_LENGTH(buf) (SCM_CELL_WORD_2(buf))
f59cf998 94#define STRINGBUF_INLINE_CHARS(buf) ((unsigned char *) SCM_CELL_OBJECT_LOC(buf,1))
3ee86942
MV
95#define STRINGBUF_INLINE_LENGTH(buf) (((size_t)SCM_CELL_WORD_0(buf))>>16)
96
97#define STRINGBUF_CHARS(buf) (STRINGBUF_INLINE (buf) \
98 ? STRINGBUF_INLINE_CHARS (buf) \
99 : STRINGBUF_OUTLINE_CHARS (buf))
50b1996f 100
f59cf998 101#define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) SCM_CELL_WORD_1(buf))
3ee86942
MV
102#define STRINGBUF_LENGTH(buf) (STRINGBUF_INLINE (buf) \
103 ? STRINGBUF_INLINE_LENGTH (buf) \
104 : STRINGBUF_OUTLINE_LENGTH (buf))
105
106#define STRINGBUF_MAX_INLINE_LEN (3*sizeof(scm_t_bits))
107
108#define SET_STRINGBUF_SHARED(buf) \
109 (SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED))
110
6ce6923b 111#if SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
112static size_t lenhist[1001];
113#endif
114
50b1996f
MG
115/* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
116 characters. */
3ee86942
MV
117static SCM
118make_stringbuf (size_t len)
0f2d19dd 119{
3ee86942
MV
120 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
121 scm_i_symbol_chars, all stringbufs are null-terminated. Once
122 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
123 has been changed for scm_i_symbol_chars, this null-termination
124 can be dropped.
125 */
126
6ce6923b 127#if SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
128 if (len < 1000)
129 lenhist[len]++;
130 else
131 lenhist[1000]++;
132#endif
0f2d19dd 133
3ee86942
MV
134 if (len <= STRINGBUF_MAX_INLINE_LEN-1)
135 {
136 return scm_double_cell (STRINGBUF_TAG | STRINGBUF_F_INLINE | (len << 16),
137 0, 0, 0);
138 }
139 else
140 {
141 char *mem = scm_gc_malloc (len+1, "string");
142 mem[len] = '\0';
143 return scm_double_cell (STRINGBUF_TAG, (scm_t_bits) mem,
144 (scm_t_bits) len, (scm_t_bits) 0);
145 }
146}
e53cc817 147
50b1996f
MG
148/* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
149 characters. */
9c44cd45
MG
150static SCM
151make_wide_stringbuf (size_t len)
152{
153 scm_t_wchar *mem;
6ce6923b 154#if SCM_STRING_LENGTH_HISTOGRAM
9c44cd45
MG
155 if (len < 1000)
156 lenhist[len]++;
157 else
158 lenhist[1000]++;
159#endif
160
161 mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
162 mem[len] = 0;
163 return scm_double_cell (STRINGBUF_TAG | STRINGBUF_F_WIDE, (scm_t_bits) mem,
164 (scm_t_bits) len, (scm_t_bits) 0);
165}
166
2b829bbb
KR
167/* Return a new stringbuf whose underlying storage consists of the LEN+1
168 octets pointed to by STR (the last octet is zero). */
7f74cf9a 169SCM
fd0a5bbc
HWN
170scm_i_take_stringbufn (char *str, size_t len)
171{
2b829bbb 172 scm_gc_register_collectable_memory (str, len + 1, "stringbuf");
fd0a5bbc
HWN
173
174 return scm_double_cell (STRINGBUF_TAG, (scm_t_bits) str,
175 (scm_t_bits) len, (scm_t_bits) 0);
176}
177
3ee86942
MV
178SCM
179scm_i_stringbuf_mark (SCM buf)
180{
181 return SCM_BOOL_F;
182}
1bbd0b84 183
3ee86942
MV
184void
185scm_i_stringbuf_free (SCM buf)
0f2d19dd 186{
3ee86942 187 if (!STRINGBUF_INLINE (buf))
9c44cd45
MG
188 {
189 if (!STRINGBUF_WIDE (buf))
190 scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf),
191 STRINGBUF_OUTLINE_LENGTH (buf) + 1, "string");
192 else
193 scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf),
194 sizeof (scm_t_wchar) * (STRINGBUF_OUTLINE_LENGTH (buf)
195 + 1), "string");
196 }
197
198}
199
50b1996f
MG
200/* Convert a stringbuf containing 8-bit Latin-1-encoded characters to
201 one containing 32-bit UCS-4-encoded characters. */
9c44cd45
MG
202static void
203widen_stringbuf (SCM buf)
204{
205 size_t i, len;
206 scm_t_wchar *mem;
207
208 if (STRINGBUF_WIDE (buf))
209 return;
210
211 if (STRINGBUF_INLINE (buf))
212 {
213 len = STRINGBUF_INLINE_LENGTH (buf);
214
215 mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
216 for (i = 0; i < len; i++)
217 mem[i] =
f59cf998 218 (scm_t_wchar) STRINGBUF_INLINE_CHARS (buf)[i];
9c44cd45
MG
219 mem[len] = 0;
220
221 SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) ^ STRINGBUF_F_INLINE);
222 SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE);
223 SCM_SET_CELL_WORD_1 (buf, mem);
224 SCM_SET_CELL_WORD_2 (buf, len);
225 }
226 else
227 {
228 len = STRINGBUF_OUTLINE_LENGTH (buf);
229
230 mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
231 for (i = 0; i < len; i++)
232 mem[i] =
f59cf998 233 (scm_t_wchar) STRINGBUF_OUTLINE_CHARS (buf)[i];
9c44cd45
MG
234 mem[len] = 0;
235
236 scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf), len + 1, "string");
237
238 SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE);
239 SCM_SET_CELL_WORD_1 (buf, mem);
240 SCM_SET_CELL_WORD_2 (buf, len);
241 }
3ee86942 242}
bd9e24b3 243
587a3355
MG
244/* Convert a stringbuf of 32-bit UCS-4-encoded characters to one
245 containing 8-bit Latin-1-encoded characters, if possible. */
246static void
247narrow_stringbuf (SCM buf)
248{
249 size_t i, len;
250 scm_t_wchar *wmem;
251 char *mem;
252
253 if (!STRINGBUF_WIDE (buf))
254 return;
255
256 len = STRINGBUF_OUTLINE_LENGTH (buf);
257 i = 0;
258 wmem = STRINGBUF_WIDE_CHARS (buf);
259 while (i < len)
260 if (wmem[i++] > 0xFF)
261 return;
262
263 mem = scm_gc_malloc (sizeof (char) * (len + 1), "string");
264 for (i = 0; i < len; i++)
265 mem[i] = (unsigned char) wmem[i];
266
267 scm_gc_free (wmem, sizeof (scm_t_wchar) * (len + 1), "string");
268
269 SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) ^ STRINGBUF_F_WIDE);
270 SCM_SET_CELL_WORD_1 (buf, mem);
271 SCM_SET_CELL_WORD_2 (buf, len);
272}
273
9de87eea 274scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
bd9e24b3 275
3ee86942
MV
276/* Copy-on-write strings.
277 */
bd9e24b3 278
3ee86942 279#define STRING_TAG scm_tc7_string
bd9e24b3 280
3ee86942
MV
281#define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
282#define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
283#define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
bd9e24b3 284
3ee86942
MV
285#define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
286#define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
287
288#define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
289
ed35de72
MV
290/* Read-only strings.
291 */
292
293#define RO_STRING_TAG (scm_tc7_string + 0x200)
294#define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
295
e1b29f6a
MV
296/* Mutation-sharing substrings
297 */
298
299#define SH_STRING_TAG (scm_tc7_string + 0x100)
300
301#define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
302/* START and LENGTH as for STRINGs. */
303
304#define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
305
50b1996f
MG
306/* Create a scheme string with space for LEN 8-bit Latin-1-encoded
307 characters. CHARSP, if not NULL, will be set to location of the
308 char array. */
3ee86942
MV
309SCM
310scm_i_make_string (size_t len, char **charsp)
311{
312 SCM buf = make_stringbuf (len);
313 SCM res;
314 if (charsp)
f59cf998 315 *charsp = (char *) STRINGBUF_CHARS (buf);
3ee86942
MV
316 res = scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
317 (scm_t_bits)0, (scm_t_bits) len);
318 return res;
0f2d19dd
JB
319}
320
50b1996f
MG
321/* Create a scheme string with space for LEN 32-bit UCS-4-encoded
322 characters. CHARSP, if not NULL, will be set to location of the
323 character array. */
9c44cd45 324SCM
50b1996f 325scm_i_make_wide_string (size_t len, scm_t_wchar **charsp)
9c44cd45
MG
326{
327 SCM buf = make_wide_stringbuf (len);
328 SCM res;
329 if (charsp)
330 *charsp = STRINGBUF_WIDE_CHARS (buf);
331 res = scm_double_cell (STRING_TAG, SCM_UNPACK (buf),
332 (scm_t_bits) 0, (scm_t_bits) len);
333 return res;
334}
335
3ee86942
MV
336static void
337validate_substring_args (SCM str, size_t start, size_t end)
338{
339 if (!IS_STRING (str))
340 scm_wrong_type_arg_msg (NULL, 0, str, "string");
341 if (start > STRING_LENGTH (str))
342 scm_out_of_range (NULL, scm_from_size_t (start));
343 if (end > STRING_LENGTH (str) || end < start)
344 scm_out_of_range (NULL, scm_from_size_t (end));
345}
0f2d19dd 346
e1b29f6a
MV
347static inline void
348get_str_buf_start (SCM *str, SCM *buf, size_t *start)
349{
350 *start = STRING_START (*str);
351 if (IS_SH_STRING (*str))
352 {
353 *str = SH_STRING_STRING (*str);
354 *start += STRING_START (*str);
355 }
356 *buf = STRING_STRINGBUF (*str);
357}
358
3ee86942
MV
359SCM
360scm_i_substring (SCM str, size_t start, size_t end)
0f2d19dd 361{
e1b29f6a
MV
362 SCM buf;
363 size_t str_start;
364 get_str_buf_start (&str, &buf, &str_start);
9de87eea 365 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 366 SET_STRINGBUF_SHARED (buf);
9de87eea 367 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 368 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
e1b29f6a
MV
369 (scm_t_bits)str_start + start,
370 (scm_t_bits) end - start);
0f2d19dd
JB
371}
372
ed35de72
MV
373SCM
374scm_i_substring_read_only (SCM str, size_t start, size_t end)
375{
45a9f430
LC
376 SCM buf;
377 size_t str_start;
378 get_str_buf_start (&str, &buf, &str_start);
379 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
380 SET_STRINGBUF_SHARED (buf);
381 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
382 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
383 (scm_t_bits)str_start + start,
384 (scm_t_bits) end - start);
ed35de72
MV
385}
386
3ee86942
MV
387SCM
388scm_i_substring_copy (SCM str, size_t start, size_t end)
389{
390 size_t len = end - start;
edea856c 391 SCM buf, my_buf;
e1b29f6a
MV
392 size_t str_start;
393 get_str_buf_start (&str, &buf, &str_start);
9c44cd45
MG
394 if (scm_i_is_narrow_string (str))
395 {
396 my_buf = make_stringbuf (len);
397 memcpy (STRINGBUF_CHARS (my_buf),
398 STRINGBUF_CHARS (buf) + str_start + start, len);
399 }
400 else
401 {
402 my_buf = make_wide_stringbuf (len);
403 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
404 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
405 + start), len);
406 /* Even though this string is wide, the substring may be narrow.
50b1996f 407 Consider adding code to narrow the string. */
9c44cd45 408 }
3ee86942 409 scm_remember_upto_here_1 (buf);
9c44cd45
MG
410 return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
411 (scm_t_bits) 0, (scm_t_bits) len);
3ee86942 412}
0f2d19dd 413
e1b29f6a
MV
414SCM
415scm_i_substring_shared (SCM str, size_t start, size_t end)
416{
417 if (start == 0 && end == STRING_LENGTH (str))
418 return str;
419 else
420 {
421 size_t len = end - start;
422 if (IS_SH_STRING (str))
423 {
424 start += STRING_START (str);
425 str = SH_STRING_STRING (str);
426 }
427 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
428 (scm_t_bits)start, (scm_t_bits) len);
429 }
430}
431
3ee86942
MV
432SCM
433scm_c_substring (SCM str, size_t start, size_t end)
434{
435 validate_substring_args (str, start, end);
436 return scm_i_substring (str, start, end);
437}
ee149d03 438
ed35de72
MV
439SCM
440scm_c_substring_read_only (SCM str, size_t start, size_t end)
441{
442 validate_substring_args (str, start, end);
443 return scm_i_substring_read_only (str, start, end);
444}
445
0f2d19dd 446SCM
3ee86942 447scm_c_substring_copy (SCM str, size_t start, size_t end)
0f2d19dd 448{
3ee86942
MV
449 validate_substring_args (str, start, end);
450 return scm_i_substring_copy (str, start, end);
451}
452
3ee86942
MV
453SCM
454scm_c_substring_shared (SCM str, size_t start, size_t end)
455{
456 validate_substring_args (str, start, end);
457 return scm_i_substring_shared (str, start, end);
458}
0f2d19dd 459
ee149d03 460SCM
3ee86942 461scm_i_string_mark (SCM str)
ee149d03 462{
3ee86942
MV
463 if (IS_SH_STRING (str))
464 return SH_STRING_STRING (str);
465 else
466 return STRING_STRINGBUF (str);
ee149d03
JB
467}
468
3ee86942
MV
469void
470scm_i_string_free (SCM str)
471{
472}
36284627 473
3ee86942
MV
474/* Internal accessors
475 */
476
50b1996f
MG
477/* Returns the number of characters in STR. This may be different
478 than the memory size of the string storage. */
3ee86942
MV
479size_t
480scm_i_string_length (SCM str)
0f2d19dd 481{
3ee86942 482 return STRING_LENGTH (str);
0f2d19dd
JB
483}
484
50b1996f
MG
485/* True if the string is 'narrow', meaning it has a 8-bit Latin-1
486 encoding. False if it is 'wide', having a 32-bit UCS-4
487 encoding. */
9c44cd45
MG
488int
489scm_i_is_narrow_string (SCM str)
490{
491 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
492}
493
587a3355
MG
494/* Try to coerce a string to be narrow. It if is narrow already, do
495 nothing. If it is wide, shrink it to narrow if none of its
496 characters are above 0xFF. Return true if the string is narrow or
497 was made to be narrow. */
498int
499scm_i_try_narrow_string (SCM str)
500{
501 narrow_stringbuf (STRING_STRINGBUF (str));
502
503 return scm_i_is_narrow_string (str);
504}
505
50b1996f
MG
506/* Returns a pointer to the 8-bit Latin-1 encoded character array of
507 STR. */
3ee86942
MV
508const char *
509scm_i_string_chars (SCM str)
510{
511 SCM buf;
e1b29f6a
MV
512 size_t start;
513 get_str_buf_start (&str, &buf, &start);
9c44cd45 514 if (scm_i_is_narrow_string (str))
f59cf998 515 return (const char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
516 else
517 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
518 scm_list_1 (str));
519 return NULL;
3ee86942 520}
b00418df 521
50b1996f
MG
522/* Returns a pointer to the 32-bit UCS-4 encoded character array of
523 STR. */
9c44cd45
MG
524const scm_t_wchar *
525scm_i_string_wide_chars (SCM str)
526{
527 SCM buf;
528 size_t start;
529
530 get_str_buf_start (&str, &buf, &start);
531 if (!scm_i_is_narrow_string (str))
f59cf998 532 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
9c44cd45
MG
533 else
534 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
535 scm_list_1 (str));
536}
537
538/* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
539 a new string buffer, so that it can be modified without modifying
50b1996f
MG
540 other strings. Also, lock the string mutex. Later, one must call
541 scm_i_string_stop_writing to unlock the mutex. */
9c44cd45
MG
542SCM
543scm_i_string_start_writing (SCM orig_str)
b00418df 544{
ed35de72 545 SCM buf, str = orig_str;
e1b29f6a 546 size_t start;
ed35de72 547
e1b29f6a 548 get_str_buf_start (&str, &buf, &start);
ed35de72
MV
549 if (IS_RO_STRING (str))
550 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
551
9de87eea 552 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942
MV
553 if (STRINGBUF_SHARED (buf))
554 {
9c44cd45 555 /* Clone the stringbuf. */
3ee86942
MV
556 size_t len = STRING_LENGTH (str);
557 SCM new_buf;
558
9de87eea 559 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 560
9c44cd45
MG
561 if (scm_i_is_narrow_string (str))
562 {
563 new_buf = make_stringbuf (len);
564 memcpy (STRINGBUF_CHARS (new_buf),
565 STRINGBUF_CHARS (buf) + STRING_START (str), len);
566
567 }
568 else
569 {
570 new_buf = make_wide_stringbuf (len);
571 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
572 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
573 + STRING_START (str)), len);
574 }
3ee86942
MV
575 scm_i_thread_put_to_sleep ();
576 SET_STRING_STRINGBUF (str, new_buf);
577 start -= STRING_START (str);
578 SET_STRING_START (str, 0);
579 scm_i_thread_wake_up ();
580
581 buf = new_buf;
582
9de87eea 583 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 584 }
9c44cd45
MG
585 return orig_str;
586}
587
50b1996f 588/* Return a pointer to the 8-bit Latin-1 chars of a string. */
9c44cd45
MG
589char *
590scm_i_string_writable_chars (SCM str)
591{
592 SCM buf;
593 size_t start;
3ee86942 594
9c44cd45
MG
595 get_str_buf_start (&str, &buf, &start);
596 if (scm_i_is_narrow_string (str))
f59cf998 597 return (char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
598 else
599 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
600 scm_list_1 (str));
601 return NULL;
602}
603
50b1996f 604/* Return a pointer to the UCS-4 codepoints of a string. */
9c44cd45
MG
605static scm_t_wchar *
606scm_i_string_writable_wide_chars (SCM str)
607{
608 SCM buf;
609 size_t start;
610
611 get_str_buf_start (&str, &buf, &start);
612 if (!scm_i_is_narrow_string (str))
613 return STRINGBUF_WIDE_CHARS (buf) + start;
614 else
1c7b216f 615 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
9c44cd45 616 scm_list_1 (str));
b00418df
DH
617}
618
50b1996f
MG
619/* Unlock the string mutex that was locked when
620 scm_i_string_start_writing was called. */
3ee86942
MV
621void
622scm_i_string_stop_writing (void)
623{
9de87eea 624 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 625}
b00418df 626
50b1996f 627/* Return the Xth character of STR as a UCS-4 codepoint. */
9c44cd45
MG
628scm_t_wchar
629scm_i_string_ref (SCM str, size_t x)
630{
631 if (scm_i_is_narrow_string (str))
632 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
633 else
634 return scm_i_string_wide_chars (str)[x];
635}
636
889975e5
MG
637/* Returns index+1 of the first char in STR that matches C, or
638 0 if the char is not found. */
639int
640scm_i_string_contains_char (SCM str, char ch)
641{
642 size_t i;
643 size_t len = scm_i_string_length (str);
644
645 i = 0;
646 if (scm_i_is_narrow_string (str))
647 {
648 while (i < len)
649 {
650 if (scm_i_string_chars (str)[i] == ch)
651 return i+1;
652 i++;
653 }
654 }
655 else
656 {
657 while (i < len)
658 {
659 if (scm_i_string_wide_chars (str)[i]
660 == (unsigned char) ch)
661 return i+1;
662 i++;
663 }
664 }
665 return 0;
666}
667
3f47e526
MG
668int
669scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
670{
671 if (scm_i_is_narrow_string (sstr))
672 {
673 const char *a = scm_i_string_chars (sstr) + start_x;
674 const char *b = cstr;
675 return strncmp (a, b, strlen(b));
676 }
677 else
678 {
679 size_t i;
680 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
681 const char *b = cstr;
682 for (i = 0; i < strlen (b); i++)
683 {
684 if (a[i] != (unsigned char) b[i])
685 return 1;
686 }
687 }
688 return 0;
689}
690
50b1996f 691/* Set the Pth character of STR to UCS-4 codepoint CHR. */
9c44cd45
MG
692void
693scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
694{
695 if (chr > 0xFF && scm_i_is_narrow_string (str))
696 widen_stringbuf (STRING_STRINGBUF (str));
697
698 if (scm_i_is_narrow_string (str))
699 {
700 char *dst = scm_i_string_writable_chars (str);
587a3355 701 dst[p] = chr;
9c44cd45
MG
702 }
703 else
704 {
705 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
706 dst[p] = chr;
707 }
708}
709
3ee86942 710/* Symbols.
587a3355 711
3ee86942
MV
712 Basic symbol creation and accessing is done here, the rest is in
713 symbols.[hc]. This has been done to keep stringbufs and the
714 internals of strings and string-like objects confined to this file.
715*/
716
717#define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
718
719SCM
6869328b
MV
720scm_i_make_symbol (SCM name, scm_t_bits flags,
721 unsigned long hash, SCM props)
3ee86942
MV
722{
723 SCM buf;
724 size_t start = STRING_START (name);
725 size_t length = STRING_LENGTH (name);
726
727 if (IS_SH_STRING (name))
728 {
729 name = SH_STRING_STRING (name);
730 start += STRING_START (name);
731 }
732 buf = SYMBOL_STRINGBUF (name);
733
734 if (start == 0 && length == STRINGBUF_LENGTH (buf))
735 {
736 /* reuse buf. */
9de87eea 737 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 738 SET_STRINGBUF_SHARED (buf);
9de87eea 739 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942
MV
740 }
741 else
742 {
743 /* make new buf. */
9c44cd45
MG
744 if (scm_i_is_narrow_string (name))
745 {
746 SCM new_buf = make_stringbuf (length);
747 memcpy (STRINGBUF_CHARS (new_buf),
748 STRINGBUF_CHARS (buf) + start, length);
749 buf = new_buf;
750 }
751 else
752 {
753 SCM new_buf = make_wide_stringbuf (length);
754 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
755 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
756 length);
757 buf = new_buf;
758 }
3ee86942 759 }
6869328b 760 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
3ee86942
MV
761 (scm_t_bits) hash, SCM_UNPACK (props));
762}
763
fd0a5bbc
HWN
764SCM
765scm_i_c_make_symbol (const char *name, size_t len,
766 scm_t_bits flags, unsigned long hash, SCM props)
767{
768 SCM buf = make_stringbuf (len);
769 memcpy (STRINGBUF_CHARS (buf), name, len);
770
771 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
772 (scm_t_bits) hash, SCM_UNPACK (props));
773}
774
775/* Return a new symbol that uses the LEN bytes pointed to by NAME as its
776 underlying storage. */
777SCM
778scm_i_c_take_symbol (char *name, size_t len,
779 scm_t_bits flags, unsigned long hash, SCM props)
780{
781 SCM buf = scm_i_take_stringbufn (name, len);
782
783 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
784 (scm_t_bits) hash, SCM_UNPACK (props));
785}
786
50b1996f
MG
787/* Returns the number of characters in SYM. This may be different
788 from the memory size of SYM. */
3ee86942
MV
789size_t
790scm_i_symbol_length (SCM sym)
0f2d19dd 791{
3ee86942 792 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
0f2d19dd
JB
793}
794
071bb6a8
LC
795size_t
796scm_c_symbol_length (SCM sym)
797#define FUNC_NAME "scm_c_symbol_length"
798{
799 SCM_VALIDATE_SYMBOL (1, sym);
800
801 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
802}
803#undef FUNC_NAME
804
50b1996f
MG
805/* True if the name of SYM is stored as a Latin-1 encoded string.
806 False if it is stored as a 32-bit UCS-4-encoded string. */
9c44cd45
MG
807int
808scm_i_is_narrow_symbol (SCM sym)
809{
810 SCM buf;
811
812 buf = SYMBOL_STRINGBUF (sym);
813 return !STRINGBUF_WIDE (buf);
814}
815
50b1996f
MG
816/* Returns a pointer to the 8-bit Latin-1 encoded character array that
817 contains the name of SYM. */
3ee86942
MV
818const char *
819scm_i_symbol_chars (SCM sym)
820{
9c44cd45
MG
821 SCM buf;
822
823 buf = SYMBOL_STRINGBUF (sym);
824 if (!STRINGBUF_WIDE (buf))
f59cf998 825 return (const char *) STRINGBUF_CHARS (buf);
9c44cd45
MG
826 else
827 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
828 scm_list_1 (sym));
829}
830
50b1996f
MG
831/* Return a pointer to the 32-bit UCS-4-encoded character array of a
832 symbol's name. */
9c44cd45
MG
833const scm_t_wchar *
834scm_i_symbol_wide_chars (SCM sym)
835{
836 SCM buf;
837
838 buf = SYMBOL_STRINGBUF (sym);
839 if (STRINGBUF_WIDE (buf))
f59cf998 840 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
9c44cd45
MG
841 else
842 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
843 scm_list_1 (sym));
3ee86942 844}
1cc91f1b 845
3ee86942
MV
846SCM
847scm_i_symbol_mark (SCM sym)
0f2d19dd 848{
3ee86942
MV
849 scm_gc_mark (SYMBOL_STRINGBUF (sym));
850 return SCM_CELL_OBJECT_3 (sym);
0f2d19dd
JB
851}
852
3ee86942
MV
853void
854scm_i_symbol_free (SCM sym)
855{
856}
0f2d19dd 857
be54b15d 858SCM
3ee86942 859scm_i_symbol_substring (SCM sym, size_t start, size_t end)
be54b15d 860{
3ee86942 861 SCM buf = SYMBOL_STRINGBUF (sym);
9de87eea 862 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 863 SET_STRINGBUF_SHARED (buf);
9de87eea 864 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
fd2b17b9 865 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
3ee86942
MV
866 (scm_t_bits)start, (scm_t_bits) end - start);
867}
be54b15d 868
50b1996f 869/* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
9c44cd45
MG
870scm_t_wchar
871scm_i_symbol_ref (SCM sym, size_t x)
872{
873 if (scm_i_is_narrow_symbol (sym))
874 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
875 else
876 return scm_i_symbol_wide_chars (sym)[x];
877}
878
3ee86942
MV
879/* Debugging
880 */
be54b15d 881
6ce6923b
MG
882SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
883 "Returns an association list containing debugging information\n"
884 "for @var{str}. The association list has the following entries."
885 "@table @code\n"
886 "@item string\n"
887 "The string itself.\n"
888 "@item start\n"
889 "The start index of the string into its stringbuf\n"
890 "@item length\n"
891 "The length of the string\n"
892 "@item shared\n"
893 "If this string is a substring, it returns its parent string.\n"
894 "Otherwise, it returns @code{#f}\n"
88ed5759
MG
895 "@item read-only\n"
896 "@code{#t} if the string is read-only\n"
6ce6923b
MG
897 "@item stringbuf-chars\n"
898 "A new string containing this string's stringbuf's characters\n"
899 "@item stringbuf-length\n"
900 "The number of characters in this stringbuf\n"
901 "@item stringbuf-shared\n"
902 "@code{#t} if this stringbuf is shared\n"
903 "@item stringbuf-inline\n"
904 "@code{#t} if this stringbuf's characters are stored in the\n"
905 "cell itself, or @code{#f} if they were allocated in memory\n"
906 "@item stringbuf-wide\n"
907 "@code{#t} if this stringbuf's characters are stored in a\n"
908 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
909 "buffer\n"
910 "@end table")
3ee86942
MV
911#define FUNC_NAME s_scm_sys_string_dump
912{
6ce6923b
MG
913 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9, e10;
914 SCM buf;
3ee86942 915 SCM_VALIDATE_STRING (1, str);
6ce6923b
MG
916
917 /* String info */
918 e1 = scm_cons (scm_from_locale_symbol ("string"),
919 str);
920 e2 = scm_cons (scm_from_locale_symbol ("start"),
921 scm_from_size_t (STRING_START (str)));
922 e3 = scm_cons (scm_from_locale_symbol ("length"),
923 scm_from_size_t (STRING_LENGTH (str)));
924
3ee86942
MV
925 if (IS_SH_STRING (str))
926 {
6ce6923b
MG
927 e4 = scm_cons (scm_from_locale_symbol ("shared"),
928 SH_STRING_STRING (str));
929 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
3ee86942
MV
930 }
931 else
932 {
6ce6923b
MG
933 e4 = scm_cons (scm_from_locale_symbol ("shared"),
934 SCM_BOOL_F);
935 buf = STRING_STRINGBUF (str);
936 }
9c44cd45 937
88ed5759
MG
938 if (IS_RO_STRING (str))
939 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
940 SCM_BOOL_T);
941 else
942 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
943 SCM_BOOL_F);
587a3355 944
6ce6923b 945 /* Stringbuf info */
6ce6923b
MG
946 if (!STRINGBUF_WIDE (buf))
947 {
948 size_t len = STRINGBUF_LENGTH (buf);
949 char *cbuf;
950 SCM sbc = scm_i_make_string (len, &cbuf);
951 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
952 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
953 sbc);
3ee86942 954 }
6ce6923b
MG
955 else
956 {
957 size_t len = STRINGBUF_LENGTH (buf);
958 scm_t_wchar *cbuf;
959 SCM sbc = scm_i_make_wide_string (len, &cbuf);
960 u32_cpy ((scm_t_uint32 *) cbuf,
961 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
962 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
963 sbc);
964 }
965 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
966 scm_from_size_t (STRINGBUF_LENGTH (buf)));
967 if (STRINGBUF_SHARED (buf))
968 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
969 SCM_BOOL_T);
970 else
971 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
972 SCM_BOOL_F);
973 if (STRINGBUF_INLINE (buf))
974 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
975 SCM_BOOL_T);
976 else
977 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
978 SCM_BOOL_F);
979 if (STRINGBUF_WIDE (buf))
980 e10 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
981 SCM_BOOL_T);
982 else
983 e10 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
984 SCM_BOOL_F);
985
986 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, SCM_UNDEFINED);
3ee86942
MV
987}
988#undef FUNC_NAME
989
6ce6923b
MG
990SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
991 "Returns an association list containing debugging information\n"
992 "for @var{sym}. The association list has the following entries."
993 "@table @code\n"
994 "@item symbol\n"
995 "The symbol itself\n"
996 "@item hash\n"
997 "Its hash value\n"
88ed5759
MG
998 "@item interned\n"
999 "@code{#t} if it is an interned symbol\n"
6ce6923b
MG
1000 "@item stringbuf-chars\n"
1001 "A new string containing this symbols's stringbuf's characters\n"
1002 "@item stringbuf-length\n"
1003 "The number of characters in this stringbuf\n"
1004 "@item stringbuf-shared\n"
1005 "@code{#t} if this stringbuf is shared\n"
1006 "@item stringbuf-inline\n"
1007 "@code{#t} if this stringbuf's characters are stored in the\n"
1008 "cell itself, or @code{#f} if they were allocated in memory\n"
1009 "@item stringbuf-wide\n"
1010 "@code{#t} if this stringbuf's characters are stored in a\n"
1011 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
1012 "buffer\n"
1013 "@end table")
3ee86942
MV
1014#define FUNC_NAME s_scm_sys_symbol_dump
1015{
6ce6923b
MG
1016 SCM e1, e2, e3, e4, e5, e6, e7, e8;
1017 SCM buf;
3ee86942 1018 SCM_VALIDATE_SYMBOL (1, sym);
6ce6923b
MG
1019 e1 = scm_cons (scm_from_locale_symbol ("symbol"),
1020 sym);
1021 e2 = scm_cons (scm_from_locale_symbol ("hash"),
1022 scm_from_ulong (scm_i_symbol_hash (sym)));
88ed5759
MG
1023 e3 = scm_cons (scm_from_locale_symbol ("interned"),
1024 scm_symbol_interned_p (sym));
6ce6923b
MG
1025 buf = SYMBOL_STRINGBUF (sym);
1026
1027 /* Stringbuf info */
6ce6923b
MG
1028 if (!STRINGBUF_WIDE (buf))
1029 {
1030 size_t len = STRINGBUF_LENGTH (buf);
1031 char *cbuf;
1032 SCM sbc = scm_i_make_string (len, &cbuf);
1033 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
1034 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
1035 sbc);
1036 }
9c44cd45 1037 else
6ce6923b
MG
1038 {
1039 size_t len = STRINGBUF_LENGTH (buf);
1040 scm_t_wchar *cbuf;
1041 SCM sbc = scm_i_make_wide_string (len, &cbuf);
1042 u32_cpy ((scm_t_uint32 *) cbuf,
1043 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
1044 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
1045 sbc);
1046 }
1047 e5 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
1048 scm_from_size_t (STRINGBUF_LENGTH (buf)));
1049 if (STRINGBUF_SHARED (buf))
1050 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
1051 SCM_BOOL_T);
1052 else
1053 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
1054 SCM_BOOL_F);
1055 if (STRINGBUF_INLINE (buf))
1056 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
1057 SCM_BOOL_T);
1058 else
1059 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
1060 SCM_BOOL_F);
1061 if (STRINGBUF_WIDE (buf))
1062 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
1063 SCM_BOOL_T);
1064 else
1065 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
1066 SCM_BOOL_F);
1067 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, SCM_UNDEFINED);
1068
3ee86942
MV
1069}
1070#undef FUNC_NAME
1071
6ce6923b
MG
1072#if SCM_STRING_LENGTH_HISTOGRAM
1073
9c44cd45 1074SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
e1b29f6a 1075#define FUNC_NAME s_scm_sys_stringbuf_hist
3ee86942
MV
1076{
1077 int i;
1078 for (i = 0; i < 1000; i++)
1079 if (lenhist[i])
1080 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1081 fprintf (stderr, ">999: %u\n", lenhist[1000]);
1082 return SCM_UNSPECIFIED;
be54b15d
DH
1083}
1084#undef FUNC_NAME
1085
3ee86942
MV
1086#endif
1087
1088\f
1089
1090SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1091 (SCM obj),
1092 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1093#define FUNC_NAME s_scm_string_p
1094{
1095 return scm_from_bool (IS_STRING (obj));
1096}
1097#undef FUNC_NAME
1098
1099
1100SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1101
1102SCM_DEFINE (scm_string, "string", 0, 0, 1,
1103 (SCM chrs),
1104 "@deffnx {Scheme Procedure} list->string chrs\n"
1105 "Return a newly allocated string composed of the arguments,\n"
1106 "@var{chrs}.")
1107#define FUNC_NAME s_scm_string
1108{
9aa27c1a 1109 SCM result = SCM_BOOL_F;
9c44cd45 1110 SCM rest;
3ee86942 1111 size_t len;
9c44cd45
MG
1112 size_t p = 0;
1113 long i;
9aa27c1a 1114 int wide = 0;
3ee86942 1115
9c44cd45
MG
1116 /* Verify that this is a list of chars. */
1117 i = scm_ilength (chrs);
3c7cf7f5
MG
1118 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1119
9c44cd45
MG
1120 len = (size_t) i;
1121 rest = chrs;
3ee86942 1122
9c44cd45 1123 while (len > 0 && scm_is_pair (rest))
3ee86942 1124 {
9c44cd45 1125 SCM elt = SCM_CAR (rest);
3ee86942 1126 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
9aa27c1a
MG
1127 if (SCM_CHAR (elt) > 0xFF)
1128 wide = 1;
9c44cd45
MG
1129 rest = SCM_CDR (rest);
1130 len--;
1131 scm_remember_upto_here_1 (elt);
1132 }
1133
1134 /* Construct a string containing this list of chars. */
1135 len = (size_t) i;
1136 rest = chrs;
1137
9aa27c1a 1138 if (wide == 0)
9c44cd45 1139 {
9aa27c1a
MG
1140 result = scm_i_make_string (len, NULL);
1141 result = scm_i_string_start_writing (result);
1142 char *buf = scm_i_string_writable_chars (result);
1143 while (len > 0 && scm_is_pair (rest))
1144 {
1145 SCM elt = SCM_CAR (rest);
1146 buf[p] = (unsigned char) SCM_CHAR (elt);
1147 p++;
1148 rest = SCM_CDR (rest);
1149 len--;
1150 scm_remember_upto_here_1 (elt);
1151 }
1152 }
1153 else
1154 {
1155 result = scm_i_make_wide_string (len, NULL);
1156 result = scm_i_string_start_writing (result);
1157 scm_t_wchar *buf = scm_i_string_writable_wide_chars (result);
1158 while (len > 0 && scm_is_pair (rest))
1159 {
1160 SCM elt = SCM_CAR (rest);
1161 buf[p] = SCM_CHAR (elt);
1162 p++;
1163 rest = SCM_CDR (rest);
1164 len--;
1165 scm_remember_upto_here_1 (elt);
1166 }
3ee86942 1167 }
9c44cd45
MG
1168 scm_i_string_stop_writing ();
1169
3ee86942
MV
1170 if (len > 0)
1171 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
9c44cd45 1172 if (!scm_is_null (rest))
3ee86942
MV
1173 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1174
1175 return result;
1176}
1177#undef FUNC_NAME
be54b15d 1178
3b3b36dd 1179SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
6fa73e72 1180 (SCM k, SCM chr),
0d26a824
MG
1181 "Return a newly allocated string of\n"
1182 "length @var{k}. If @var{chr} is given, then all elements of\n"
1183 "the string are initialized to @var{chr}, otherwise the contents\n"
9401323e 1184 "of the @var{string} are unspecified.")
1bbd0b84 1185#define FUNC_NAME s_scm_make_string
0f2d19dd 1186{
3ee86942
MV
1187 return scm_c_make_string (scm_to_size_t (k), chr);
1188}
1189#undef FUNC_NAME
1190
1191SCM
1192scm_c_make_string (size_t len, SCM chr)
1193#define FUNC_NAME NULL
1194{
9c44cd45
MG
1195 size_t p;
1196 SCM res = scm_i_make_string (len, NULL);
cb0d8be2 1197
e11e83f3
MV
1198 if (!SCM_UNBNDP (chr))
1199 {
3ee86942 1200 SCM_VALIDATE_CHAR (0, chr);
9c44cd45
MG
1201 res = scm_i_string_start_writing (res);
1202 for (p = 0; p < len; p++)
1203 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1204 scm_i_string_stop_writing ();
0f2d19dd 1205 }
e11e83f3
MV
1206
1207 return res;
0f2d19dd 1208}
1bbd0b84 1209#undef FUNC_NAME
0f2d19dd 1210
3b3b36dd 1211SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
0d26a824
MG
1212 (SCM string),
1213 "Return the number of characters in @var{string}.")
1bbd0b84 1214#define FUNC_NAME s_scm_string_length
0f2d19dd 1215{
d1ca2c64 1216 SCM_VALIDATE_STRING (1, string);
3ee86942 1217 return scm_from_size_t (STRING_LENGTH (string));
0f2d19dd 1218}
1bbd0b84 1219#undef FUNC_NAME
0f2d19dd 1220
f8ba2bb9 1221SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
9c44cd45
MG
1222 (SCM string),
1223 "Return the bytes used to represent a character in @var{string}."
1224 "This will return 1 or 4.")
f8ba2bb9 1225#define FUNC_NAME s_scm_string_bytes_per_char
9c44cd45
MG
1226{
1227 SCM_VALIDATE_STRING (1, string);
1228 if (!scm_i_is_narrow_string (string))
1229 return scm_from_int (4);
1230
1231 return scm_from_int (1);
1232}
1233#undef FUNC_NAME
1234
3ee86942
MV
1235size_t
1236scm_c_string_length (SCM string)
1237{
1238 if (!IS_STRING (string))
1239 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1240 return STRING_LENGTH (string);
1241}
1242
bd9e24b3 1243SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
6fa73e72 1244 (SCM str, SCM k),
9c44cd45
MG
1245 "Return character @var{k} of @var{str} using zero-origin\n"
1246 "indexing. @var{k} must be a valid index of @var{str}.")
1bbd0b84 1247#define FUNC_NAME s_scm_string_ref
0f2d19dd 1248{
3ae3166b 1249 size_t len;
a55c2b68 1250 unsigned long idx;
bd9e24b3 1251
d1ca2c64 1252 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1253
1254 len = scm_i_string_length (str);
1255 if (SCM_LIKELY (len > 0))
1256 idx = scm_to_unsigned_integer (k, 0, len - 1);
1257 else
1258 scm_out_of_range (NULL, k);
1259
9c44cd45
MG
1260 if (scm_i_is_narrow_string (str))
1261 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1262 else
1263 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
0f2d19dd 1264}
1bbd0b84 1265#undef FUNC_NAME
0f2d19dd 1266
3ee86942
MV
1267SCM
1268scm_c_string_ref (SCM str, size_t p)
1269{
1270 if (p >= scm_i_string_length (str))
1271 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1272 if (scm_i_is_narrow_string (str))
1273 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1274 else
1275 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1276
3ee86942 1277}
f0942910 1278
3b3b36dd 1279SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
6fa73e72 1280 (SCM str, SCM k, SCM chr),
9c44cd45
MG
1281 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1282 "an unspecified value. @var{k} must be a valid index of\n"
1283 "@var{str}.")
1bbd0b84 1284#define FUNC_NAME s_scm_string_set_x
0f2d19dd 1285{
3ae3166b 1286 size_t len;
a55c2b68
MV
1287 unsigned long idx;
1288
f0942910 1289 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1290
1291 len = scm_i_string_length (str);
1292 if (SCM_LIKELY (len > 0))
1293 idx = scm_to_unsigned_integer (k, 0, len - 1);
1294 else
1295 scm_out_of_range (NULL, k);
1296
34d19ef6 1297 SCM_VALIDATE_CHAR (3, chr);
9c44cd45
MG
1298 str = scm_i_string_start_writing (str);
1299 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1300 scm_i_string_stop_writing ();
1301
0f2d19dd
JB
1302 return SCM_UNSPECIFIED;
1303}
1bbd0b84 1304#undef FUNC_NAME
0f2d19dd 1305
3ee86942
MV
1306void
1307scm_c_string_set_x (SCM str, size_t p, SCM chr)
1308{
1309 if (p >= scm_i_string_length (str))
1310 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1311 str = scm_i_string_start_writing (str);
1312 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1313 scm_i_string_stop_writing ();
3ee86942 1314}
0f2d19dd 1315
3b3b36dd 1316SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
0d26a824
MG
1317 (SCM str, SCM start, SCM end),
1318 "Return a newly allocated string formed from the characters\n"
1319 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1320 "ending with index @var{end} (exclusive).\n"
1321 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1322 "exact integers satisfying:\n\n"
1323 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1bbd0b84 1324#define FUNC_NAME s_scm_substring
0f2d19dd 1325{
3ee86942 1326 size_t len, from, to;
685c0d71 1327
d1ca2c64 1328 SCM_VALIDATE_STRING (1, str);
3ee86942
MV
1329 len = scm_i_string_length (str);
1330 from = scm_to_unsigned_integer (start, 0, len);
a55c2b68 1331 if (SCM_UNBNDP (end))
3ee86942 1332 to = len;
a55c2b68 1333 else
3ee86942
MV
1334 to = scm_to_unsigned_integer (end, from, len);
1335 return scm_i_substring (str, from, to);
0f2d19dd 1336}
1bbd0b84 1337#undef FUNC_NAME
0f2d19dd 1338
ed35de72
MV
1339SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1340 (SCM str, SCM start, SCM end),
1341 "Return a newly allocated string formed from the characters\n"
1342 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1343 "ending with index @var{end} (exclusive).\n"
1344 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1345 "exact integers satisfying:\n"
1346 "\n"
1347 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1348 "\n"
1349 "The returned string is read-only.\n")
1350#define FUNC_NAME s_scm_substring_read_only
1351{
1352 size_t len, from, to;
1353
1354 SCM_VALIDATE_STRING (1, str);
1355 len = scm_i_string_length (str);
1356 from = scm_to_unsigned_integer (start, 0, len);
1357 if (SCM_UNBNDP (end))
1358 to = len;
1359 else
1360 to = scm_to_unsigned_integer (end, from, len);
1361 return scm_i_substring_read_only (str, from, to);
1362}
1363#undef FUNC_NAME
1364
3ee86942
MV
1365SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1366 (SCM str, SCM start, SCM end),
1367 "Return a newly allocated string formed from the characters\n"
1368 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1369 "ending with index @var{end} (exclusive).\n"
1370 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1371 "exact integers satisfying:\n\n"
1372 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1373#define FUNC_NAME s_scm_substring_copy
1374{
e1b29f6a
MV
1375 /* For the Scheme version, START is mandatory, but for the C
1376 version, it is optional. See scm_string_copy in srfi-13.c for a
1377 rationale.
1378 */
1379
1380 size_t from, to;
3ee86942
MV
1381
1382 SCM_VALIDATE_STRING (1, str);
e1b29f6a
MV
1383 scm_i_get_substring_spec (scm_i_string_length (str),
1384 start, &from, end, &to);
3ee86942
MV
1385 return scm_i_substring_copy (str, from, to);
1386}
1387#undef FUNC_NAME
1388
1389SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1390 (SCM str, SCM start, SCM end),
1391 "Return string that indirectly refers to the characters\n"
1392 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1393 "ending with index @var{end} (exclusive).\n"
1394 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1395 "exact integers satisfying:\n\n"
1396 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1397#define FUNC_NAME s_scm_substring_shared
1398{
1399 size_t len, from, to;
1400
1401 SCM_VALIDATE_STRING (1, str);
1402 len = scm_i_string_length (str);
1403 from = scm_to_unsigned_integer (start, 0, len);
1404 if (SCM_UNBNDP (end))
1405 to = len;
1406 else
1407 to = scm_to_unsigned_integer (end, from, len);
1408 return scm_i_substring_shared (str, from, to);
1409}
1410#undef FUNC_NAME
685c0d71 1411
3b3b36dd 1412SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
6fa73e72 1413 (SCM args),
9c44cd45 1414 "Return a newly allocated string whose characters form the\n"
0d26a824 1415 "concatenation of the given strings, @var{args}.")
1bbd0b84 1416#define FUNC_NAME s_scm_string_append
0f2d19dd
JB
1417{
1418 SCM res;
9c44cd45
MG
1419 size_t len = 0;
1420 int wide = 0;
c829a427 1421 SCM l, s;
bd4911ef 1422 size_t i;
9909c395
MG
1423 union
1424 {
1425 char *narrow;
1426 scm_t_wchar *wide;
1427 } data;
af45e3b0
DH
1428
1429 SCM_VALIDATE_REST_ARGUMENT (args);
9c44cd45 1430 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427
MV
1431 {
1432 s = SCM_CAR (l);
1433 SCM_VALIDATE_STRING (SCM_ARGn, s);
9c44cd45
MG
1434 len += scm_i_string_length (s);
1435 if (!scm_i_is_narrow_string (s))
1436 wide = 1;
c829a427 1437 }
9909c395 1438 data.narrow = NULL;
9c44cd45 1439 if (!wide)
9909c395 1440 res = scm_i_make_string (len, &data.narrow);
9c44cd45 1441 else
9909c395 1442 res = scm_i_make_wide_string (len, &data.wide);
9c44cd45
MG
1443
1444 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427 1445 {
edea856c 1446 size_t len;
c829a427 1447 s = SCM_CAR (l);
3ee86942 1448 SCM_VALIDATE_STRING (SCM_ARGn, s);
edea856c 1449 len = scm_i_string_length (s);
9c44cd45
MG
1450 if (!wide)
1451 {
9909c395
MG
1452 memcpy (data.narrow, scm_i_string_chars (s), len);
1453 data.narrow += len;
9c44cd45
MG
1454 }
1455 else
1456 {
1457 if (scm_i_is_narrow_string (s))
1458 {
1459 for (i = 0; i < scm_i_string_length (s); i++)
9909c395 1460 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
9c44cd45
MG
1461 }
1462 else
9909c395 1463 u32_cpy ((scm_t_uint32 *) data.wide,
9c44cd45 1464 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
9909c395 1465 data.wide += len;
9c44cd45 1466 }
c829a427
MV
1467 scm_remember_upto_here_1 (s);
1468 }
0f2d19dd
JB
1469 return res;
1470}
1bbd0b84 1471#undef FUNC_NAME
0f2d19dd 1472
c829a427
MV
1473int
1474scm_is_string (SCM obj)
1475{
3ee86942 1476 return IS_STRING (obj);
c829a427 1477}
24933780 1478
587a3355
MG
1479static SCM
1480scm_from_stringn (const char *str, size_t len, const char *encoding,
1481 scm_t_string_failed_conversion_handler handler)
1482{
1483 size_t u32len, i;
1484 scm_t_wchar *u32;
1485 int wide = 0;
1486 SCM res;
1487
889975e5
MG
1488 if (encoding == NULL)
1489 {
1490 /* If encoding is null, use Latin-1. */
1491 char *buf;
1492 res = scm_i_make_string (len, &buf);
1493 memcpy (buf, str, len);
1494 return res;
1495 }
1496
587a3355
MG
1497 u32len = 0;
1498 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1499 (enum iconv_ilseq_handler)
1500 handler,
1501 str, len,
1502 NULL,
1503 NULL, &u32len);
1504
1505 if (u32 == NULL)
1506 {
1507 if (errno == ENOMEM)
1508 scm_memory_error ("locale string conversion");
1509 else
1510 {
889975e5 1511 /* There are invalid sequences in the input string. */
587a3355
MG
1512 SCM errstr;
1513 char *dst;
587a3355
MG
1514 errstr = scm_i_make_string (len, &dst);
1515 memcpy (dst, str, len);
1516 scm_misc_error (NULL, "input locale conversion error from ~s: ~s",
1517 scm_list_2 (scm_from_locale_string (encoding),
1518 errstr));
1519 scm_remember_upto_here_1 (errstr);
1520 }
1521 }
1522
1523 i = 0;
1524 while (i < u32len)
1525 if (u32[i++] > 0xFF)
1526 {
1527 wide = 1;
1528 break;
1529 }
1530
1531 if (!wide)
1532 {
1533 char *dst;
1534 res = scm_i_make_string (u32len, &dst);
1535 for (i = 0; i < u32len; i ++)
1536 dst[i] = (unsigned char) u32[i];
1537 dst[u32len] = '\0';
1538 }
1539 else
1540 {
1541 scm_t_wchar *wdst;
1542 res = scm_i_make_wide_string (u32len, &wdst);
1543 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1544 wdst[u32len] = 0;
1545 }
1546
1547 free (u32);
1548 return res;
1549}
1550
889975e5
MG
1551SCM
1552scm_from_locale_stringn (const char *str, size_t len)
1553{
1554 const char *enc;
1555 scm_t_string_failed_conversion_handler hndl;
1556 SCM inport;
1557 scm_t_port *pt;
1558
1559 if (len == (size_t) -1)
1560 len = strlen (str);
1561 if (len == 0)
1562 return scm_nullstr;
1563
1564 inport = scm_current_input_port ();
1565 if (!SCM_UNBNDP (inport) && SCM_OPINPORTP (inport))
1566 {
1567 pt = SCM_PTAB_ENTRY (inport);
1568 enc = pt->encoding;
1569 hndl = pt->ilseq_handler;
1570 }
1571 else
1572 {
1573 enc = NULL;
1574 hndl = SCM_FAILED_CONVERSION_ERROR;
1575 }
1576
1577 return scm_from_stringn (str, len, enc, hndl);
1578}
1579
1580SCM
1581scm_from_locale_string (const char *str)
1582{
1583 if (str == NULL)
1584 return scm_nullstr;
1585
1586 return scm_from_locale_stringn (str, -1);
1587}
1588
587a3355
MG
1589SCM
1590scm_i_from_utf8_string (const scm_t_uint8 *str)
1591{
1592 return scm_from_stringn ((const char *) str,
1593 strlen ((char *) str), "UTF-8",
1594 SCM_FAILED_CONVERSION_ERROR);
1595}
1596
50b1996f
MG
1597/* Create a new scheme string from the C string STR. The memory of
1598 STR may be used directly as storage for the new string. */
c829a427
MV
1599SCM
1600scm_take_locale_stringn (char *str, size_t len)
1601{
48ddf0d9
KR
1602 SCM buf, res;
1603
9c44cd45 1604 if (len == (size_t) -1)
48ddf0d9 1605 len = strlen (str);
c829a427
MV
1606 else
1607 {
48ddf0d9
KR
1608 /* Ensure STR is null terminated. A realloc for 1 extra byte should
1609 often be satisfied from the alignment padding after the block, with
1610 no actual data movement. */
9c44cd45 1611 str = scm_realloc (str, len + 1);
48ddf0d9 1612 str[len] = '\0';
c829a427 1613 }
c829a427 1614
fd0a5bbc 1615 buf = scm_i_take_stringbufn (str, len);
3ee86942 1616 res = scm_double_cell (STRING_TAG,
9c44cd45 1617 SCM_UNPACK (buf), (scm_t_bits) 0, (scm_t_bits) len);
c829a427
MV
1618 return res;
1619}
1620
48ddf0d9
KR
1621SCM
1622scm_take_locale_string (char *str)
1623{
1624 return scm_take_locale_stringn (str, -1);
1625}
1626
9c44cd45
MG
1627/* Change libunistring escapes (\uXXXX and \UXXXXXXXX) to \xXX \uXXXX
1628 and \UXXXXXX. */
1629static void
1630unistring_escapes_to_guile_escapes (char **bufp, size_t *lenp)
1631{
1632 char *before, *after;
1633 size_t i, j;
1634
1635 before = *bufp;
1636 after = *bufp;
1637 i = 0;
1638 j = 0;
1639 while (i < *lenp)
1640 {
1641 if ((i <= *lenp - 6)
1642 && before[i] == '\\'
1643 && before[i + 1] == 'u'
1644 && before[i + 2] == '0' && before[i + 3] == '0')
1645 {
1646 /* Convert \u00NN to \xNN */
1647 after[j] = '\\';
1648 after[j + 1] = 'x';
30a6b9ca
MG
1649 after[j + 2] = tolower ((int) before[i + 4]);
1650 after[j + 3] = tolower ((int) before[i + 5]);
9c44cd45
MG
1651 i += 6;
1652 j += 4;
1653 }
1654 else if ((i <= *lenp - 10)
1655 && before[i] == '\\'
1656 && before[i + 1] == 'U'
1657 && before[i + 2] == '0' && before[i + 3] == '0')
1658 {
1659 /* Convert \U00NNNNNN to \UNNNNNN */
1660 after[j] = '\\';
1661 after[j + 1] = 'U';
30a6b9ca
MG
1662 after[j + 2] = tolower ((int) before[i + 4]);
1663 after[j + 3] = tolower ((int) before[i + 5]);
1664 after[j + 4] = tolower ((int) before[i + 6]);
1665 after[j + 5] = tolower ((int) before[i + 7]);
1666 after[j + 6] = tolower ((int) before[i + 8]);
1667 after[j + 7] = tolower ((int) before[i + 9]);
9c44cd45
MG
1668 i += 10;
1669 j += 8;
1670 }
1671 else
1672 {
1673 after[j] = before[i];
1674 i++;
1675 j++;
1676 }
1677 }
1678 *lenp = j;
1679 after = scm_realloc (after, j);
1680}
1681
c829a427 1682char *
9c44cd45 1683scm_to_locale_stringn (SCM str, size_t * lenp)
c829a427 1684{
889975e5
MG
1685 SCM outport;
1686 scm_t_port *pt;
9c44cd45
MG
1687 const char *enc;
1688
889975e5
MG
1689 outport = scm_current_output_port ();
1690 if (!SCM_UNBNDP (outport) && SCM_OPOUTPORTP (outport))
1691 {
1692 pt = SCM_PTAB_ENTRY (outport);
1693 enc = pt->encoding;
1694 }
1695 else
1696 enc = NULL;
9c44cd45 1697
889975e5
MG
1698 return scm_to_stringn (str, lenp,
1699 enc,
1700 scm_i_get_conversion_strategy (SCM_BOOL_F));
9c44cd45
MG
1701}
1702
1703/* Low-level scheme to C string conversion function. */
1704char *
587a3355 1705scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
eca29b02 1706 scm_t_string_failed_conversion_handler handler)
9c44cd45 1707{
9c44cd45
MG
1708 char *buf;
1709 size_t ilen, len, i;
889975e5
MG
1710 int ret;
1711 const char *enc;
4d4528e7 1712
3ee86942 1713 if (!scm_is_string (str))
c829a427 1714 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1715 ilen = scm_i_string_length (str);
1716
1717 if (ilen == 0)
1718 {
1719 buf = scm_malloc (1);
1720 buf[0] = '\0';
1721 if (lenp)
1722 *lenp = 0;
1723 return buf;
1724 }
587a3355 1725
c829a427 1726 if (lenp == NULL)
9c44cd45
MG
1727 for (i = 0; i < ilen; i++)
1728 if (scm_i_string_ref (str, i) == '\0')
1729 scm_misc_error (NULL,
1730 "string contains #\\nul character: ~S",
1731 scm_list_1 (str));
1732
889975e5 1733 if (scm_i_is_narrow_string (str) && (encoding == NULL))
c829a427 1734 {
889975e5
MG
1735 /* If using native Latin-1 encoding, just copy the string
1736 contents. */
9c44cd45
MG
1737 if (lenp)
1738 {
1739 buf = scm_malloc (ilen);
1740 memcpy (buf, scm_i_string_chars (str), ilen);
1741 *lenp = ilen;
1742 return buf;
1743 }
1744 else
1745 {
1746 buf = scm_malloc (ilen + 1);
1747 memcpy (buf, scm_i_string_chars (str), ilen);
1748 buf[ilen] = '\0';
1749 return buf;
1750 }
c829a427 1751 }
9c44cd45 1752
587a3355 1753
9c44cd45
MG
1754 buf = NULL;
1755 len = 0;
889975e5
MG
1756 enc = encoding;
1757 if (enc == NULL)
1758 enc = "ISO-8859-1";
1759 if (scm_i_is_narrow_string (str))
1760 {
1761 ret = mem_iconveh (scm_i_string_chars (str), ilen,
1762 "ISO-8859-1", enc,
1763 (enum iconv_ilseq_handler) handler, NULL,
1764 &buf, &len);
9c44cd45 1765
889975e5
MG
1766 if (ret == 0 && handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
1767 unistring_escapes_to_guile_escapes (&buf, &len);
9c44cd45 1768
889975e5
MG
1769 if (ret != 0)
1770 {
1771 scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
1772 scm_list_2 (scm_from_locale_string (enc),
1773 str));
1774 }
1775 }
1776 else
1777 {
1778 buf = u32_conv_to_encoding (enc,
1779 (enum iconv_ilseq_handler) handler,
1780 (scm_t_uint32 *) scm_i_string_wide_chars (str),
1781 ilen,
1782 NULL,
1783 NULL, &len);
1784 if (buf == NULL)
1785 {
1786 scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
1787 scm_list_2 (scm_from_locale_string (enc),
1788 str));
1789 }
1790 }
9c44cd45 1791 if (lenp)
4d4528e7 1792 *lenp = len;
9c44cd45
MG
1793 else
1794 {
1795 buf = scm_realloc (buf, len + 1);
1796 buf[len] = '\0';
1797 }
24933780 1798
c829a427 1799 scm_remember_upto_here_1 (str);
9c44cd45 1800 return buf;
4d4528e7 1801}
af68e5e5 1802
c829a427
MV
1803char *
1804scm_to_locale_string (SCM str)
1805{
1806 return scm_to_locale_stringn (str, NULL);
1807}
af68e5e5 1808
587a3355
MG
1809scm_t_uint8 *
1810scm_i_to_utf8_string (SCM str)
1811{
1812 char *u8str;
1813 u8str = scm_to_stringn (str, NULL, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1814 return (scm_t_uint8 *) u8str;
1815}
1816
c829a427
MV
1817size_t
1818scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
1819{
1820 size_t len;
9c44cd45 1821 char *result = NULL;
3ee86942 1822 if (!scm_is_string (str))
c829a427 1823 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1824 result = scm_to_locale_stringn (str, &len);
1825
1826 memcpy (buf, result, (len > max_len) ? max_len : len);
1827 free (result);
1828
c829a427
MV
1829 scm_remember_upto_here_1 (str);
1830 return len;
1831}
af68e5e5 1832
3ee86942
MV
1833/* converts C scm_array of strings to SCM scm_list of strings. */
1834/* If argc < 0, a null terminated scm_array is assumed. */
9c44cd45 1835SCM
3ee86942
MV
1836scm_makfromstrs (int argc, char **argv)
1837{
1838 int i = argc;
1839 SCM lst = SCM_EOL;
1840 if (0 > i)
1841 for (i = 0; argv[i]; i++);
1842 while (i--)
1843 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
1844 return lst;
1845}
1846
c829a427
MV
1847/* Return a newly allocated array of char pointers to each of the strings
1848 in args, with a terminating NULL pointer. */
1849
1850char **
1851scm_i_allocate_string_pointers (SCM list)
af68e5e5 1852{
c829a427
MV
1853 char **result;
1854 int len = scm_ilength (list);
1855 int i;
1856
1857 if (len < 0)
1858 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
1859
661ae7ab 1860 scm_dynwind_begin (0);
c829a427
MV
1861
1862 result = (char **) scm_malloc ((len + 1) * sizeof (char *));
1863 result[len] = NULL;
661ae7ab 1864 scm_dynwind_unwind_handler (free, result, 0);
c829a427
MV
1865
1866 /* The list might be have been modified in another thread, so
1867 we check LIST before each access.
1868 */
d2e53ed6 1869 for (i = 0; i < len && scm_is_pair (list); i++)
c829a427
MV
1870 {
1871 result[i] = scm_to_locale_string (SCM_CAR (list));
1872 list = SCM_CDR (list);
1873 }
1874
661ae7ab 1875 scm_dynwind_end ();
c829a427 1876 return result;
af68e5e5 1877}
e53cc817 1878
c829a427
MV
1879void
1880scm_i_free_string_pointers (char **pointers)
1881{
1882 int i;
1883
1884 for (i = 0; pointers[i]; i++)
1885 free (pointers[i]);
1886 free (pointers);
1887}
24933780 1888
6f14f578
MV
1889void
1890scm_i_get_substring_spec (size_t len,
1891 SCM start, size_t *cstart,
1892 SCM end, size_t *cend)
1893{
1894 if (SCM_UNBNDP (start))
1895 *cstart = 0;
1896 else
1897 *cstart = scm_to_unsigned_integer (start, 0, len);
1898
1899 if (SCM_UNBNDP (end))
1900 *cend = len;
1901 else
1902 *cend = scm_to_unsigned_integer (end, *cstart, len);
1903}
1904
3ee86942
MV
1905#if SCM_ENABLE_DEPRECATED
1906
556d75db
MV
1907/* When these definitions are removed, it becomes reasonable to use
1908 read-only strings for string literals. For that, change the reader
1909 to create string literals with scm_c_substring_read_only instead of
1910 with scm_c_substring_copy.
1911*/
1912
3ee86942 1913int
fe78c51a 1914scm_i_deprecated_stringp (SCM str)
3ee86942
MV
1915{
1916 scm_c_issue_deprecation_warning
1917 ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
1918
2616f0e0 1919 return scm_is_string (str);
3ee86942
MV
1920}
1921
1922char *
fe78c51a 1923scm_i_deprecated_string_chars (SCM str)
3ee86942
MV
1924{
1925 char *chars;
1926
1927 scm_c_issue_deprecation_warning
1928 ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
1929
2616f0e0
MV
1930 /* We don't accept shared substrings here since they are not
1931 null-terminated.
1932 */
1933 if (IS_SH_STRING (str))
1934 scm_misc_error (NULL,
1935 "SCM_STRING_CHARS does not work with shared substrings.",
1936 SCM_EOL);
1937
877f06c3 1938 /* We explicitly test for read-only strings to produce a better
556d75db
MV
1939 error message.
1940 */
1941
1942 if (IS_RO_STRING (str))
1943 scm_misc_error (NULL,
1944 "SCM_STRING_CHARS does not work with read-only strings.",
1945 SCM_EOL);
1946
2616f0e0 1947 /* The following is still wrong, of course...
3ee86942 1948 */
9c44cd45 1949 str = scm_i_string_start_writing (str);
3ee86942
MV
1950 chars = scm_i_string_writable_chars (str);
1951 scm_i_string_stop_writing ();
1952 return chars;
1953}
1954
1955size_t
fe78c51a 1956scm_i_deprecated_string_length (SCM str)
3ee86942
MV
1957{
1958 scm_c_issue_deprecation_warning
1959 ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
1960 return scm_c_string_length (str);
1961}
1962
1963#endif
1964
0f2d19dd
JB
1965void
1966scm_init_strings ()
0f2d19dd 1967{
3ee86942 1968 scm_nullstr = scm_i_make_string (0, NULL);
7c33806a 1969
a0599745 1970#include "libguile/strings.x"
0f2d19dd
JB
1971}
1972
89e00824
ML
1973
1974/*
1975 Local Variables:
1976 c-file-style: "gnu"
1977 End:
1978*/