Fix thread-unsafe lazy initializations.
[bpt/guile.git] / libguile / strings.c
CommitLineData
a7e392c1 1/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
0f2d19dd 2 *
73be1d9e 3 * This library is free software; you can redistribute it and/or
53befeb7
NJ
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
0f2d19dd 7 *
53befeb7
NJ
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
0f2d19dd 12 *
73be1d9e
MV
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
53befeb7
NJ
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
73be1d9e 17 */
1bbd0b84 18
1bbd0b84 19
0f2d19dd 20\f
dbb605f5
LC
21#ifdef HAVE_CONFIG_H
22# include <config.h>
23#endif
0f2d19dd 24
cdd47ec7 25#include <alloca.h>
faf2c9d7 26#include <string.h>
3ee86942 27#include <stdio.h>
9c44cd45 28#include <ctype.h>
edb7bb47 29#include <uninorm.h>
9c44cd45 30#include <unistr.h>
eca29b02 31#include <uniconv.h>
faf2c9d7 32
889975e5
MG
33#include "striconveh.h"
34
a0599745
MD
35#include "libguile/_scm.h"
36#include "libguile/chars.h"
7c33806a 37#include "libguile/root.h"
a0599745 38#include "libguile/strings.h"
a3d7d5d5 39#include "libguile/error.h"
f45eccff 40#include "libguile/generalized-vectors.h"
1afff620 41#include "libguile/deprecation.h"
a0599745 42#include "libguile/validate.h"
d31b9519 43#include "libguile/private-options.h"
1afff620 44
0f2d19dd
JB
45\f
46
47/* {Strings}
48 */
49
3ee86942
MV
50
51/* Stringbufs
52 *
53 * XXX - keeping an accurate refcount during GC seems to be quite
54 * tricky, so we just keep score of whether a stringbuf might be
50b1996f 55 * shared, not whether it definitely is.
3ee86942
MV
56 *
57 * The scheme I (mvo) tried to keep an accurate reference count would
58 * recount all strings that point to a stringbuf during the mark-phase
59 * of the GC. This was done since one cannot access the stringbuf of
60 * a string when that string is freed (in order to decrease the
61 * reference count). The memory of the stringbuf might have been
62 * reused already for something completely different.
63 *
64 * This recounted worked for a small number of threads beating on
65 * cow-strings, but it failed randomly with more than 10 threads, say.
66 * I couldn't figure out what went wrong, so I used the conservative
67 * approach implemented below.
50b1996f 68 *
ba54a202
LC
69 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
70 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
71 * strings.
3ee86942
MV
72 */
73
ba54a202
LC
74/* The size in words of the stringbuf header (type tag + size). */
75#define STRINGBUF_HEADER_SIZE 2U
76
77#define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
78
35920c00 79#define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
5f236208 80#define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
3ee86942
MV
81
82#define STRINGBUF_TAG scm_tc7_stringbuf
83#define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
9c44cd45 84#define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
3ee86942 85
100e20c7 86#define STRINGBUF_CONTENTS(buf) ((void *) \
ba54a202
LC
87 SCM_CELL_OBJECT_LOC (buf, \
88 STRINGBUF_HEADER_SIZE))
100e20c7
LC
89#define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
90#define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
3ee86942 91
100e20c7 92#define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
3ee86942 93
9b41542f
LC
94#define SET_STRINGBUF_SHARED(buf) \
95 do \
96 { \
97 /* Don't modify BUF if it's already marked as shared since it might be \
98 a read-only, statically allocated stringbuf. */ \
99 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
100 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
101 } \
102 while (0)
3ee86942 103
56a3dcd4 104#ifdef SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
105static size_t lenhist[1001];
106#endif
107
50b1996f
MG
108/* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
109 characters. */
3ee86942
MV
110static SCM
111make_stringbuf (size_t len)
0f2d19dd 112{
3ee86942
MV
113 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
114 scm_i_symbol_chars, all stringbufs are null-terminated. Once
115 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
116 has been changed for scm_i_symbol_chars, this null-termination
117 can be dropped.
118 */
119
ba54a202
LC
120 SCM buf;
121
56a3dcd4 122#ifdef SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
123 if (len < 1000)
124 lenhist[len]++;
125 else
126 lenhist[1000]++;
127#endif
0f2d19dd 128
ba54a202
LC
129 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
130 "string"));
131
132 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
133 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
134
135 STRINGBUF_CHARS (buf)[len] = 0;
136
137 return buf;
3ee86942 138}
e53cc817 139
50b1996f
MG
140/* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
141 characters. */
9c44cd45
MG
142static SCM
143make_wide_stringbuf (size_t len)
144{
ba54a202
LC
145 SCM buf;
146 size_t raw_len;
147
56a3dcd4 148#ifdef SCM_STRING_LENGTH_HISTOGRAM
9c44cd45
MG
149 if (len < 1000)
150 lenhist[len]++;
151 else
152 lenhist[1000]++;
153#endif
154
ba54a202
LC
155 raw_len = (len + 1) * sizeof (scm_t_wchar);
156 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
157 "string"));
158
159 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
160 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
161
162 STRINGBUF_WIDE_CHARS (buf)[len] = 0;
163
164 return buf;
9c44cd45
MG
165}
166
ba54a202
LC
167/* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
168 characters from BUF. */
169static SCM
170wide_stringbuf (SCM buf)
9c44cd45 171{
ba54a202 172 SCM new_buf;
9c44cd45
MG
173
174 if (STRINGBUF_WIDE (buf))
ba54a202
LC
175 new_buf = buf;
176 else
9c44cd45 177 {
ba54a202
LC
178 size_t i, len;
179 scm_t_wchar *mem;
9c44cd45 180
ba54a202 181 len = STRINGBUF_LENGTH (buf);
9c44cd45 182
ba54a202 183 new_buf = make_wide_stringbuf (len);
9c44cd45 184
ba54a202 185 mem = STRINGBUF_WIDE_CHARS (new_buf);
9c44cd45 186 for (i = 0; i < len; i++)
ba54a202 187 mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
9c44cd45 188 mem[len] = 0;
9c44cd45 189 }
ba54a202
LC
190
191 return new_buf;
3ee86942 192}
bd9e24b3 193
ba54a202
LC
194/* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
195 characters from BUF, if possible. */
196static SCM
587a3355
MG
197narrow_stringbuf (SCM buf)
198{
ba54a202 199 SCM new_buf;
587a3355
MG
200
201 if (!STRINGBUF_WIDE (buf))
ba54a202
LC
202 new_buf = buf;
203 else
204 {
205 size_t i, len;
206 scm_t_wchar *wmem;
207 unsigned char *mem;
587a3355 208
ba54a202
LC
209 len = STRINGBUF_LENGTH (buf);
210 wmem = STRINGBUF_WIDE_CHARS (buf);
587a3355 211
ba54a202
LC
212 for (i = 0; i < len; i++)
213 if (wmem[i] > 0xFF)
214 /* BUF cannot be narrowed. */
215 return buf;
587a3355 216
ba54a202 217 new_buf = make_stringbuf (len);
587a3355 218
ba54a202
LC
219 mem = STRINGBUF_CHARS (new_buf);
220 for (i = 0; i < len; i++)
221 mem[i] = (unsigned char) wmem[i];
222 mem[len] = 0;
223 }
224
225 return new_buf;
587a3355
MG
226}
227
9de87eea 228scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
bd9e24b3 229
ba54a202 230\f
3ee86942
MV
231/* Copy-on-write strings.
232 */
bd9e24b3 233
3ee86942 234#define STRING_TAG scm_tc7_string
bd9e24b3 235
3ee86942
MV
236#define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
237#define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
238#define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
bd9e24b3 239
3ee86942
MV
240#define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
241#define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
242
243#define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
244
ed35de72
MV
245/* Read-only strings.
246 */
247
35920c00 248#define RO_STRING_TAG scm_tc7_ro_string
ed35de72
MV
249#define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
250
e1b29f6a
MV
251/* Mutation-sharing substrings
252 */
253
254#define SH_STRING_TAG (scm_tc7_string + 0x100)
255
256#define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
257/* START and LENGTH as for STRINGs. */
258
259#define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
260
e7efe8e7
AW
261SCM scm_nullstr;
262
60617d81
MW
263static SCM null_stringbuf;
264
265static void
266init_null_stringbuf (void)
267{
268 null_stringbuf = make_stringbuf (0);
269 SET_STRINGBUF_SHARED (null_stringbuf);
270}
271
50b1996f
MG
272/* Create a scheme string with space for LEN 8-bit Latin-1-encoded
273 characters. CHARSP, if not NULL, will be set to location of the
190d4b0d
LC
274 char array. If READ_ONLY_P, the returned string is read-only;
275 otherwise it is writable. */
3ee86942 276SCM
190d4b0d 277scm_i_make_string (size_t len, char **charsp, int read_only_p)
3ee86942 278{
69cd5299 279 SCM buf;
3ee86942 280 SCM res;
69cd5299
MW
281
282 if (len == 0)
283 {
60617d81
MW
284 static scm_i_pthread_once_t once = SCM_I_PTHREAD_ONCE_INIT;
285 scm_i_pthread_once (&once, init_null_stringbuf);
69cd5299
MW
286 buf = null_stringbuf;
287 }
288 else
289 buf = make_stringbuf (len);
290
3ee86942 291 if (charsp)
f59cf998 292 *charsp = (char *) STRINGBUF_CHARS (buf);
190d4b0d
LC
293 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
294 SCM_UNPACK (buf),
295 (scm_t_bits) 0, (scm_t_bits) len);
3ee86942 296 return res;
0f2d19dd
JB
297}
298
50b1996f
MG
299/* Create a scheme string with space for LEN 32-bit UCS-4-encoded
300 characters. CHARSP, if not NULL, will be set to location of the
190d4b0d
LC
301 character array. If READ_ONLY_P, the returned string is read-only;
302 otherwise it is writable. */
9c44cd45 303SCM
190d4b0d 304scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
9c44cd45
MG
305{
306 SCM buf = make_wide_stringbuf (len);
307 SCM res;
308 if (charsp)
309 *charsp = STRINGBUF_WIDE_CHARS (buf);
190d4b0d
LC
310 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
311 SCM_UNPACK (buf),
9c44cd45
MG
312 (scm_t_bits) 0, (scm_t_bits) len);
313 return res;
314}
315
3ee86942
MV
316static void
317validate_substring_args (SCM str, size_t start, size_t end)
318{
319 if (!IS_STRING (str))
320 scm_wrong_type_arg_msg (NULL, 0, str, "string");
321 if (start > STRING_LENGTH (str))
322 scm_out_of_range (NULL, scm_from_size_t (start));
323 if (end > STRING_LENGTH (str) || end < start)
324 scm_out_of_range (NULL, scm_from_size_t (end));
325}
0f2d19dd 326
e1b29f6a
MV
327static inline void
328get_str_buf_start (SCM *str, SCM *buf, size_t *start)
329{
330 *start = STRING_START (*str);
331 if (IS_SH_STRING (*str))
332 {
333 *str = SH_STRING_STRING (*str);
334 *start += STRING_START (*str);
335 }
336 *buf = STRING_STRINGBUF (*str);
337}
338
3ee86942
MV
339SCM
340scm_i_substring (SCM str, size_t start, size_t end)
0f2d19dd 341{
17bec545
MW
342 if (start == end)
343 return scm_i_make_string (0, NULL, 0);
344 else
345 {
346 SCM buf;
347 size_t str_start;
348 get_str_buf_start (&str, &buf, &str_start);
349 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
350 SET_STRINGBUF_SHARED (buf);
351 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
352 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
353 (scm_t_bits)str_start + start,
354 (scm_t_bits) end - start);
355 }
0f2d19dd
JB
356}
357
ed35de72
MV
358SCM
359scm_i_substring_read_only (SCM str, size_t start, size_t end)
360{
17bec545
MW
361 if (start == end)
362 return scm_i_make_string (0, NULL, 1);
363 else
364 {
365 SCM buf;
366 size_t str_start;
367 get_str_buf_start (&str, &buf, &str_start);
368 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
369 SET_STRINGBUF_SHARED (buf);
370 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
371 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
372 (scm_t_bits)str_start + start,
373 (scm_t_bits) end - start);
374 }
ed35de72
MV
375}
376
3ee86942
MV
377SCM
378scm_i_substring_copy (SCM str, size_t start, size_t end)
379{
d5b75b6c
MW
380 if (start == end)
381 return scm_i_make_string (0, NULL, 0);
9c44cd45
MG
382 else
383 {
d5b75b6c
MW
384 size_t len = end - start;
385 SCM buf, my_buf, substr;
386 size_t str_start;
387 int wide = 0;
388 get_str_buf_start (&str, &buf, &str_start);
389 if (scm_i_is_narrow_string (str))
390 {
391 my_buf = make_stringbuf (len);
392 memcpy (STRINGBUF_CHARS (my_buf),
393 STRINGBUF_CHARS (buf) + str_start + start, len);
394 }
395 else
396 {
397 my_buf = make_wide_stringbuf (len);
398 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
399 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
400 + start), len);
401 wide = 1;
402 }
403 scm_remember_upto_here_1 (buf);
404 substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
405 (scm_t_bits) 0, (scm_t_bits) len);
406 if (wide)
407 scm_i_try_narrow_string (substr);
408 return substr;
9c44cd45 409 }
3ee86942 410}
0f2d19dd 411
e1b29f6a
MV
412SCM
413scm_i_substring_shared (SCM str, size_t start, size_t end)
414{
415 if (start == 0 && end == STRING_LENGTH (str))
416 return str;
17bec545
MW
417 else if (start == end)
418 return scm_i_make_string (0, NULL, 0);
419 else
e1b29f6a
MV
420 {
421 size_t len = end - start;
422 if (IS_SH_STRING (str))
423 {
424 start += STRING_START (str);
425 str = SH_STRING_STRING (str);
426 }
427 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
428 (scm_t_bits)start, (scm_t_bits) len);
429 }
430}
431
3ee86942
MV
432SCM
433scm_c_substring (SCM str, size_t start, size_t end)
434{
435 validate_substring_args (str, start, end);
436 return scm_i_substring (str, start, end);
437}
ee149d03 438
ed35de72
MV
439SCM
440scm_c_substring_read_only (SCM str, size_t start, size_t end)
441{
442 validate_substring_args (str, start, end);
443 return scm_i_substring_read_only (str, start, end);
444}
445
0f2d19dd 446SCM
3ee86942 447scm_c_substring_copy (SCM str, size_t start, size_t end)
0f2d19dd 448{
3ee86942
MV
449 validate_substring_args (str, start, end);
450 return scm_i_substring_copy (str, start, end);
451}
452
3ee86942
MV
453SCM
454scm_c_substring_shared (SCM str, size_t start, size_t end)
455{
456 validate_substring_args (str, start, end);
457 return scm_i_substring_shared (str, start, end);
458}
0f2d19dd 459
d6c74168 460\f
3ee86942
MV
461/* Internal accessors
462 */
463
50b1996f
MG
464/* Returns the number of characters in STR. This may be different
465 than the memory size of the string storage. */
3ee86942
MV
466size_t
467scm_i_string_length (SCM str)
0f2d19dd 468{
3ee86942 469 return STRING_LENGTH (str);
0f2d19dd
JB
470}
471
50b1996f
MG
472/* True if the string is 'narrow', meaning it has a 8-bit Latin-1
473 encoding. False if it is 'wide', having a 32-bit UCS-4
474 encoding. */
9c44cd45
MG
475int
476scm_i_is_narrow_string (SCM str)
477{
49d09292
MW
478 if (IS_SH_STRING (str))
479 str = SH_STRING_STRING (str);
480
9c44cd45
MG
481 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
482}
483
587a3355
MG
484/* Try to coerce a string to be narrow. It if is narrow already, do
485 nothing. If it is wide, shrink it to narrow if none of its
486 characters are above 0xFF. Return true if the string is narrow or
487 was made to be narrow. */
488int
489scm_i_try_narrow_string (SCM str)
490{
49d09292
MW
491 if (IS_SH_STRING (str))
492 str = SH_STRING_STRING (str);
493
ba54a202 494 SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
587a3355
MG
495
496 return scm_i_is_narrow_string (str);
497}
498
100e20c7
LC
499/* Return a pointer to the raw data of the string, which can be either Latin-1
500 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
501const void *
502scm_i_string_data (SCM str)
503{
504 SCM buf;
505 size_t start;
506 const char *data;
507
508 get_str_buf_start (&str, &buf, &start);
509
510 data = STRINGBUF_CONTENTS (buf);
511 data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
512
513 return data;
514}
515
50b1996f
MG
516/* Returns a pointer to the 8-bit Latin-1 encoded character array of
517 STR. */
3ee86942
MV
518const char *
519scm_i_string_chars (SCM str)
520{
521 SCM buf;
e1b29f6a
MV
522 size_t start;
523 get_str_buf_start (&str, &buf, &start);
9c44cd45 524 if (scm_i_is_narrow_string (str))
f59cf998 525 return (const char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
526 else
527 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
528 scm_list_1 (str));
529 return NULL;
3ee86942 530}
b00418df 531
50b1996f
MG
532/* Returns a pointer to the 32-bit UCS-4 encoded character array of
533 STR. */
9c44cd45
MG
534const scm_t_wchar *
535scm_i_string_wide_chars (SCM str)
536{
537 SCM buf;
538 size_t start;
539
540 get_str_buf_start (&str, &buf, &start);
541 if (!scm_i_is_narrow_string (str))
f59cf998 542 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
9c44cd45
MG
543 else
544 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
545 scm_list_1 (str));
546}
547
548/* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
549 a new string buffer, so that it can be modified without modifying
50b1996f
MG
550 other strings. Also, lock the string mutex. Later, one must call
551 scm_i_string_stop_writing to unlock the mutex. */
9c44cd45
MG
552SCM
553scm_i_string_start_writing (SCM orig_str)
b00418df 554{
ed35de72 555 SCM buf, str = orig_str;
e1b29f6a 556 size_t start;
ed35de72 557
e1b29f6a 558 get_str_buf_start (&str, &buf, &start);
ed35de72
MV
559 if (IS_RO_STRING (str))
560 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
561
9de87eea 562 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942
MV
563 if (STRINGBUF_SHARED (buf))
564 {
9c44cd45 565 /* Clone the stringbuf. */
3ee86942
MV
566 size_t len = STRING_LENGTH (str);
567 SCM new_buf;
568
9de87eea 569 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 570
9c44cd45
MG
571 if (scm_i_is_narrow_string (str))
572 {
573 new_buf = make_stringbuf (len);
574 memcpy (STRINGBUF_CHARS (new_buf),
575 STRINGBUF_CHARS (buf) + STRING_START (str), len);
576
577 }
578 else
579 {
580 new_buf = make_wide_stringbuf (len);
581 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
582 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
583 + STRING_START (str)), len);
584 }
3ee86942 585
3ee86942
MV
586 SET_STRING_STRINGBUF (str, new_buf);
587 start -= STRING_START (str);
902578f1
LC
588
589 /* FIXME: The following operations are not atomic, so other threads
590 looking at STR may see an inconsistent state. Nevertheless it can't
591 hurt much since (i) accessing STR while it is being mutated can't
592 yield a crash, and (ii) concurrent accesses to STR should be
593 protected by a mutex at the application level. The latter may not
594 apply when STR != ORIG_STR, though. */
3ee86942 595 SET_STRING_START (str, 0);
902578f1 596 SET_STRING_STRINGBUF (str, new_buf);
3ee86942
MV
597
598 buf = new_buf;
599
9de87eea 600 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 601 }
9c44cd45
MG
602 return orig_str;
603}
3ee86942 604
50b1996f 605/* Return a pointer to the 8-bit Latin-1 chars of a string. */
9c44cd45
MG
606char *
607scm_i_string_writable_chars (SCM str)
608{
609 SCM buf;
610 size_t start;
3ee86942 611
9c44cd45
MG
612 get_str_buf_start (&str, &buf, &start);
613 if (scm_i_is_narrow_string (str))
f59cf998 614 return (char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
615 else
616 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
617 scm_list_1 (str));
618 return NULL;
b00418df
DH
619}
620
50b1996f 621/* Return a pointer to the UCS-4 codepoints of a string. */
9c44cd45
MG
622static scm_t_wchar *
623scm_i_string_writable_wide_chars (SCM str)
624{
625 SCM buf;
626 size_t start;
627
628 get_str_buf_start (&str, &buf, &start);
629 if (!scm_i_is_narrow_string (str))
630 return STRINGBUF_WIDE_CHARS (buf) + start;
631 else
1c7b216f 632 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
9c44cd45 633 scm_list_1 (str));
b00418df
DH
634}
635
50b1996f
MG
636/* Unlock the string mutex that was locked when
637 scm_i_string_start_writing was called. */
3ee86942
MV
638void
639scm_i_string_stop_writing (void)
640{
9de87eea 641 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 642}
b00418df 643
50b1996f 644/* Return the Xth character of STR as a UCS-4 codepoint. */
9c44cd45
MG
645scm_t_wchar
646scm_i_string_ref (SCM str, size_t x)
647{
648 if (scm_i_is_narrow_string (str))
649 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
650 else
651 return scm_i_string_wide_chars (str)[x];
652}
653
889975e5
MG
654/* Returns index+1 of the first char in STR that matches C, or
655 0 if the char is not found. */
656int
657scm_i_string_contains_char (SCM str, char ch)
658{
659 size_t i;
660 size_t len = scm_i_string_length (str);
661
662 i = 0;
663 if (scm_i_is_narrow_string (str))
664 {
665 while (i < len)
666 {
667 if (scm_i_string_chars (str)[i] == ch)
668 return i+1;
669 i++;
670 }
671 }
672 else
673 {
674 while (i < len)
675 {
676 if (scm_i_string_wide_chars (str)[i]
677 == (unsigned char) ch)
678 return i+1;
679 i++;
680 }
681 }
682 return 0;
683}
684
3f47e526
MG
685int
686scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
687{
688 if (scm_i_is_narrow_string (sstr))
689 {
690 const char *a = scm_i_string_chars (sstr) + start_x;
691 const char *b = cstr;
692 return strncmp (a, b, strlen(b));
693 }
694 else
695 {
696 size_t i;
697 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
698 const char *b = cstr;
699 for (i = 0; i < strlen (b); i++)
700 {
701 if (a[i] != (unsigned char) b[i])
702 return 1;
703 }
704 }
705 return 0;
706}
707
50b1996f 708/* Set the Pth character of STR to UCS-4 codepoint CHR. */
9c44cd45
MG
709void
710scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
711{
49d09292
MW
712 if (IS_SH_STRING (str))
713 {
714 p += STRING_START (str);
715 str = SH_STRING_STRING (str);
716 }
717
9c44cd45 718 if (chr > 0xFF && scm_i_is_narrow_string (str))
ba54a202 719 SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
9c44cd45
MG
720
721 if (scm_i_is_narrow_string (str))
722 {
723 char *dst = scm_i_string_writable_chars (str);
587a3355 724 dst[p] = chr;
9c44cd45
MG
725 }
726 else
727 {
728 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
729 dst[p] = chr;
730 }
731}
732
ba54a202 733\f
3ee86942 734/* Symbols.
587a3355 735
3ee86942
MV
736 Basic symbol creation and accessing is done here, the rest is in
737 symbols.[hc]. This has been done to keep stringbufs and the
738 internals of strings and string-like objects confined to this file.
739*/
740
741#define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
742
743SCM
6869328b
MV
744scm_i_make_symbol (SCM name, scm_t_bits flags,
745 unsigned long hash, SCM props)
3ee86942
MV
746{
747 SCM buf;
748 size_t start = STRING_START (name);
749 size_t length = STRING_LENGTH (name);
750
751 if (IS_SH_STRING (name))
752 {
753 name = SH_STRING_STRING (name);
754 start += STRING_START (name);
755 }
1a4d7653 756 buf = STRING_STRINGBUF (name);
3ee86942
MV
757
758 if (start == 0 && length == STRINGBUF_LENGTH (buf))
759 {
760 /* reuse buf. */
9de87eea 761 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 762 SET_STRINGBUF_SHARED (buf);
9de87eea 763 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942
MV
764 }
765 else
766 {
767 /* make new buf. */
9c44cd45
MG
768 if (scm_i_is_narrow_string (name))
769 {
770 SCM new_buf = make_stringbuf (length);
771 memcpy (STRINGBUF_CHARS (new_buf),
772 STRINGBUF_CHARS (buf) + start, length);
773 buf = new_buf;
774 }
775 else
776 {
777 SCM new_buf = make_wide_stringbuf (length);
778 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
779 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
780 length);
781 buf = new_buf;
782 }
3ee86942 783 }
6869328b 784 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
3ee86942
MV
785 (scm_t_bits) hash, SCM_UNPACK (props));
786}
787
fd0a5bbc
HWN
788SCM
789scm_i_c_make_symbol (const char *name, size_t len,
790 scm_t_bits flags, unsigned long hash, SCM props)
791{
792 SCM buf = make_stringbuf (len);
793 memcpy (STRINGBUF_CHARS (buf), name, len);
794
65619ebe
AW
795 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
796 (scm_t_bits) hash, SCM_UNPACK (props));
fd0a5bbc
HWN
797}
798
50b1996f
MG
799/* Returns the number of characters in SYM. This may be different
800 from the memory size of SYM. */
3ee86942
MV
801size_t
802scm_i_symbol_length (SCM sym)
0f2d19dd 803{
3ee86942 804 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
0f2d19dd
JB
805}
806
071bb6a8
LC
807size_t
808scm_c_symbol_length (SCM sym)
809#define FUNC_NAME "scm_c_symbol_length"
810{
811 SCM_VALIDATE_SYMBOL (1, sym);
812
813 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
814}
815#undef FUNC_NAME
816
50b1996f
MG
817/* True if the name of SYM is stored as a Latin-1 encoded string.
818 False if it is stored as a 32-bit UCS-4-encoded string. */
9c44cd45
MG
819int
820scm_i_is_narrow_symbol (SCM sym)
821{
822 SCM buf;
823
824 buf = SYMBOL_STRINGBUF (sym);
825 return !STRINGBUF_WIDE (buf);
826}
827
50b1996f
MG
828/* Returns a pointer to the 8-bit Latin-1 encoded character array that
829 contains the name of SYM. */
3ee86942
MV
830const char *
831scm_i_symbol_chars (SCM sym)
832{
9c44cd45
MG
833 SCM buf;
834
835 buf = SYMBOL_STRINGBUF (sym);
836 if (!STRINGBUF_WIDE (buf))
f59cf998 837 return (const char *) STRINGBUF_CHARS (buf);
9c44cd45
MG
838 else
839 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
840 scm_list_1 (sym));
841}
842
50b1996f
MG
843/* Return a pointer to the 32-bit UCS-4-encoded character array of a
844 symbol's name. */
9c44cd45
MG
845const scm_t_wchar *
846scm_i_symbol_wide_chars (SCM sym)
847{
848 SCM buf;
849
850 buf = SYMBOL_STRINGBUF (sym);
851 if (STRINGBUF_WIDE (buf))
f59cf998 852 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
9c44cd45
MG
853 else
854 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
855 scm_list_1 (sym));
3ee86942 856}
1cc91f1b 857
be54b15d 858SCM
3ee86942 859scm_i_symbol_substring (SCM sym, size_t start, size_t end)
be54b15d 860{
3ee86942 861 SCM buf = SYMBOL_STRINGBUF (sym);
9de87eea 862 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 863 SET_STRINGBUF_SHARED (buf);
9de87eea 864 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
fd2b17b9 865 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
3ee86942
MV
866 (scm_t_bits)start, (scm_t_bits) end - start);
867}
be54b15d 868
50b1996f 869/* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
9c44cd45
MG
870scm_t_wchar
871scm_i_symbol_ref (SCM sym, size_t x)
872{
873 if (scm_i_is_narrow_symbol (sym))
874 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
875 else
876 return scm_i_symbol_wide_chars (sym)[x];
877}
878
3ee86942
MV
879/* Debugging
880 */
be54b15d 881
6ce6923b
MG
882SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
883 "Returns an association list containing debugging information\n"
884 "for @var{str}. The association list has the following entries."
885 "@table @code\n"
886 "@item string\n"
887 "The string itself.\n"
888 "@item start\n"
889 "The start index of the string into its stringbuf\n"
890 "@item length\n"
891 "The length of the string\n"
892 "@item shared\n"
893 "If this string is a substring, it returns its parent string.\n"
894 "Otherwise, it returns @code{#f}\n"
88ed5759
MG
895 "@item read-only\n"
896 "@code{#t} if the string is read-only\n"
6ce6923b
MG
897 "@item stringbuf-chars\n"
898 "A new string containing this string's stringbuf's characters\n"
899 "@item stringbuf-length\n"
900 "The number of characters in this stringbuf\n"
901 "@item stringbuf-shared\n"
902 "@code{#t} if this stringbuf is shared\n"
6ce6923b
MG
903 "@item stringbuf-wide\n"
904 "@code{#t} if this stringbuf's characters are stored in a\n"
905 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
906 "buffer\n"
907 "@end table")
3ee86942
MV
908#define FUNC_NAME s_scm_sys_string_dump
909{
ba54a202 910 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
6ce6923b 911 SCM buf;
3ee86942 912 SCM_VALIDATE_STRING (1, str);
6ce6923b
MG
913
914 /* String info */
4a655e50 915 e1 = scm_cons (scm_from_latin1_symbol ("string"),
6ce6923b 916 str);
4a655e50 917 e2 = scm_cons (scm_from_latin1_symbol ("start"),
6ce6923b 918 scm_from_size_t (STRING_START (str)));
4a655e50 919 e3 = scm_cons (scm_from_latin1_symbol ("length"),
6ce6923b
MG
920 scm_from_size_t (STRING_LENGTH (str)));
921
3ee86942
MV
922 if (IS_SH_STRING (str))
923 {
4a655e50 924 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
6ce6923b
MG
925 SH_STRING_STRING (str));
926 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
3ee86942
MV
927 }
928 else
929 {
4a655e50 930 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
6ce6923b
MG
931 SCM_BOOL_F);
932 buf = STRING_STRINGBUF (str);
3ee86942 933 }
9c44cd45 934
88ed5759 935 if (IS_RO_STRING (str))
4a655e50 936 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
88ed5759
MG
937 SCM_BOOL_T);
938 else
4a655e50 939 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
88ed5759 940 SCM_BOOL_F);
587a3355 941
6ce6923b 942 /* Stringbuf info */
6ce6923b
MG
943 if (!STRINGBUF_WIDE (buf))
944 {
945 size_t len = STRINGBUF_LENGTH (buf);
946 char *cbuf;
190d4b0d 947 SCM sbc = scm_i_make_string (len, &cbuf, 0);
6ce6923b 948 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
4a655e50 949 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b 950 sbc);
3ee86942 951 }
6ce6923b
MG
952 else
953 {
954 size_t len = STRINGBUF_LENGTH (buf);
955 scm_t_wchar *cbuf;
190d4b0d 956 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
6ce6923b
MG
957 u32_cpy ((scm_t_uint32 *) cbuf,
958 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
4a655e50 959 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b
MG
960 sbc);
961 }
4a655e50 962 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
6ce6923b
MG
963 scm_from_size_t (STRINGBUF_LENGTH (buf)));
964 if (STRINGBUF_SHARED (buf))
4a655e50 965 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b
MG
966 SCM_BOOL_T);
967 else
4a655e50 968 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b 969 SCM_BOOL_F);
6ce6923b 970 if (STRINGBUF_WIDE (buf))
4a655e50 971 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
ba54a202 972 SCM_BOOL_T);
6ce6923b 973 else
4a655e50 974 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
ba54a202 975 SCM_BOOL_F);
6ce6923b 976
ba54a202 977 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
3ee86942
MV
978}
979#undef FUNC_NAME
980
6ce6923b
MG
981SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
982 "Returns an association list containing debugging information\n"
983 "for @var{sym}. The association list has the following entries."
984 "@table @code\n"
985 "@item symbol\n"
986 "The symbol itself\n"
987 "@item hash\n"
988 "Its hash value\n"
88ed5759
MG
989 "@item interned\n"
990 "@code{#t} if it is an interned symbol\n"
6ce6923b
MG
991 "@item stringbuf-chars\n"
992 "A new string containing this symbols's stringbuf's characters\n"
993 "@item stringbuf-length\n"
994 "The number of characters in this stringbuf\n"
995 "@item stringbuf-shared\n"
996 "@code{#t} if this stringbuf is shared\n"
6ce6923b
MG
997 "@item stringbuf-wide\n"
998 "@code{#t} if this stringbuf's characters are stored in a\n"
999 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
1000 "buffer\n"
1001 "@end table")
3ee86942
MV
1002#define FUNC_NAME s_scm_sys_symbol_dump
1003{
ba54a202 1004 SCM e1, e2, e3, e4, e5, e6, e7;
6ce6923b 1005 SCM buf;
3ee86942 1006 SCM_VALIDATE_SYMBOL (1, sym);
4a655e50 1007 e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
6ce6923b 1008 sym);
4a655e50 1009 e2 = scm_cons (scm_from_latin1_symbol ("hash"),
6ce6923b 1010 scm_from_ulong (scm_i_symbol_hash (sym)));
4a655e50 1011 e3 = scm_cons (scm_from_latin1_symbol ("interned"),
88ed5759 1012 scm_symbol_interned_p (sym));
6ce6923b
MG
1013 buf = SYMBOL_STRINGBUF (sym);
1014
1015 /* Stringbuf info */
6ce6923b
MG
1016 if (!STRINGBUF_WIDE (buf))
1017 {
1018 size_t len = STRINGBUF_LENGTH (buf);
1019 char *cbuf;
190d4b0d 1020 SCM sbc = scm_i_make_string (len, &cbuf, 0);
6ce6923b 1021 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
4a655e50 1022 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b
MG
1023 sbc);
1024 }
9c44cd45 1025 else
6ce6923b
MG
1026 {
1027 size_t len = STRINGBUF_LENGTH (buf);
1028 scm_t_wchar *cbuf;
190d4b0d 1029 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
6ce6923b
MG
1030 u32_cpy ((scm_t_uint32 *) cbuf,
1031 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
4a655e50 1032 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b
MG
1033 sbc);
1034 }
4a655e50 1035 e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
6ce6923b
MG
1036 scm_from_size_t (STRINGBUF_LENGTH (buf)));
1037 if (STRINGBUF_SHARED (buf))
4a655e50 1038 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b
MG
1039 SCM_BOOL_T);
1040 else
4a655e50 1041 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b 1042 SCM_BOOL_F);
6ce6923b 1043 if (STRINGBUF_WIDE (buf))
4a655e50 1044 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
6ce6923b
MG
1045 SCM_BOOL_T);
1046 else
4a655e50 1047 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
6ce6923b 1048 SCM_BOOL_F);
ba54a202 1049 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
6ce6923b 1050
3ee86942
MV
1051}
1052#undef FUNC_NAME
1053
56a3dcd4 1054#ifdef SCM_STRING_LENGTH_HISTOGRAM
6ce6923b 1055
9c44cd45 1056SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
e1b29f6a 1057#define FUNC_NAME s_scm_sys_stringbuf_hist
3ee86942
MV
1058{
1059 int i;
1060 for (i = 0; i < 1000; i++)
1061 if (lenhist[i])
1062 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1063 fprintf (stderr, ">999: %u\n", lenhist[1000]);
1064 return SCM_UNSPECIFIED;
be54b15d
DH
1065}
1066#undef FUNC_NAME
1067
3ee86942
MV
1068#endif
1069
1070\f
1071
1072SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1073 (SCM obj),
1074 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1075#define FUNC_NAME s_scm_string_p
1076{
1077 return scm_from_bool (IS_STRING (obj));
1078}
1079#undef FUNC_NAME
1080
1081
1082SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1083
1084SCM_DEFINE (scm_string, "string", 0, 0, 1,
1085 (SCM chrs),
1086 "@deffnx {Scheme Procedure} list->string chrs\n"
1087 "Return a newly allocated string composed of the arguments,\n"
1088 "@var{chrs}.")
1089#define FUNC_NAME s_scm_string
1090{
9aa27c1a 1091 SCM result = SCM_BOOL_F;
9c44cd45 1092 SCM rest;
3ee86942 1093 size_t len;
9c44cd45
MG
1094 size_t p = 0;
1095 long i;
9aa27c1a 1096 int wide = 0;
3ee86942 1097
9c44cd45
MG
1098 /* Verify that this is a list of chars. */
1099 i = scm_ilength (chrs);
3c7cf7f5 1100 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
3ee86942 1101
9c44cd45
MG
1102 len = (size_t) i;
1103 rest = chrs;
3ee86942 1104
9c44cd45 1105 while (len > 0 && scm_is_pair (rest))
3ee86942 1106 {
9c44cd45 1107 SCM elt = SCM_CAR (rest);
3ee86942 1108 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
9aa27c1a
MG
1109 if (SCM_CHAR (elt) > 0xFF)
1110 wide = 1;
9c44cd45
MG
1111 rest = SCM_CDR (rest);
1112 len--;
1113 scm_remember_upto_here_1 (elt);
1114 }
1115
1116 /* Construct a string containing this list of chars. */
1117 len = (size_t) i;
1118 rest = chrs;
1119
9aa27c1a 1120 if (wide == 0)
9c44cd45 1121 {
56a3dcd4
LC
1122 char *buf;
1123
190d4b0d 1124 result = scm_i_make_string (len, NULL, 0);
9aa27c1a 1125 result = scm_i_string_start_writing (result);
56a3dcd4 1126 buf = scm_i_string_writable_chars (result);
9aa27c1a
MG
1127 while (len > 0 && scm_is_pair (rest))
1128 {
1129 SCM elt = SCM_CAR (rest);
1130 buf[p] = (unsigned char) SCM_CHAR (elt);
1131 p++;
1132 rest = SCM_CDR (rest);
1133 len--;
1134 scm_remember_upto_here_1 (elt);
1135 }
1136 }
1137 else
1138 {
56a3dcd4
LC
1139 scm_t_wchar *buf;
1140
190d4b0d 1141 result = scm_i_make_wide_string (len, NULL, 0);
9aa27c1a 1142 result = scm_i_string_start_writing (result);
56a3dcd4 1143 buf = scm_i_string_writable_wide_chars (result);
9aa27c1a
MG
1144 while (len > 0 && scm_is_pair (rest))
1145 {
1146 SCM elt = SCM_CAR (rest);
1147 buf[p] = SCM_CHAR (elt);
1148 p++;
1149 rest = SCM_CDR (rest);
1150 len--;
1151 scm_remember_upto_here_1 (elt);
1152 }
3ee86942 1153 }
9c44cd45
MG
1154 scm_i_string_stop_writing ();
1155
3ee86942
MV
1156 if (len > 0)
1157 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
9c44cd45 1158 if (!scm_is_null (rest))
3ee86942
MV
1159 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1160
1161 return result;
1162}
1163#undef FUNC_NAME
be54b15d 1164
3b3b36dd 1165SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
6fa73e72 1166 (SCM k, SCM chr),
0d26a824
MG
1167 "Return a newly allocated string of\n"
1168 "length @var{k}. If @var{chr} is given, then all elements of\n"
1169 "the string are initialized to @var{chr}, otherwise the contents\n"
b7e64f8b 1170 "of the string are all set to @code{#\nul}.")
1bbd0b84 1171#define FUNC_NAME s_scm_make_string
0f2d19dd 1172{
3ee86942
MV
1173 return scm_c_make_string (scm_to_size_t (k), chr);
1174}
1175#undef FUNC_NAME
1176
1177SCM
1178scm_c_make_string (size_t len, SCM chr)
1179#define FUNC_NAME NULL
1180{
9c44cd45 1181 size_t p;
3ef6650d 1182 char *contents = NULL;
190d4b0d 1183 SCM res = scm_i_make_string (len, &contents, 0);
cb0d8be2 1184
3ef6650d
AW
1185 /* If no char is given, initialize string contents to NULL. */
1186 if (SCM_UNBNDP (chr))
1187 memset (contents, 0, len);
1188 else
e11e83f3 1189 {
3ee86942 1190 SCM_VALIDATE_CHAR (0, chr);
9c44cd45
MG
1191 res = scm_i_string_start_writing (res);
1192 for (p = 0; p < len; p++)
1193 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1194 scm_i_string_stop_writing ();
0f2d19dd 1195 }
e11e83f3
MV
1196
1197 return res;
0f2d19dd 1198}
1bbd0b84 1199#undef FUNC_NAME
0f2d19dd 1200
3b3b36dd 1201SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
0d26a824
MG
1202 (SCM string),
1203 "Return the number of characters in @var{string}.")
1bbd0b84 1204#define FUNC_NAME s_scm_string_length
0f2d19dd 1205{
d1ca2c64 1206 SCM_VALIDATE_STRING (1, string);
3ee86942 1207 return scm_from_size_t (STRING_LENGTH (string));
0f2d19dd 1208}
1bbd0b84 1209#undef FUNC_NAME
0f2d19dd 1210
f8ba2bb9 1211SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
9c44cd45
MG
1212 (SCM string),
1213 "Return the bytes used to represent a character in @var{string}."
1214 "This will return 1 or 4.")
f8ba2bb9 1215#define FUNC_NAME s_scm_string_bytes_per_char
9c44cd45
MG
1216{
1217 SCM_VALIDATE_STRING (1, string);
1218 if (!scm_i_is_narrow_string (string))
1219 return scm_from_int (4);
1220
1221 return scm_from_int (1);
1222}
1223#undef FUNC_NAME
1224
3ee86942
MV
1225size_t
1226scm_c_string_length (SCM string)
1227{
1228 if (!IS_STRING (string))
1229 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1230 return STRING_LENGTH (string);
1231}
1232
bd9e24b3 1233SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
6fa73e72 1234 (SCM str, SCM k),
9c44cd45
MG
1235 "Return character @var{k} of @var{str} using zero-origin\n"
1236 "indexing. @var{k} must be a valid index of @var{str}.")
1bbd0b84 1237#define FUNC_NAME s_scm_string_ref
0f2d19dd 1238{
3ae3166b 1239 size_t len;
a55c2b68 1240 unsigned long idx;
bd9e24b3 1241
d1ca2c64 1242 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1243
1244 len = scm_i_string_length (str);
1245 if (SCM_LIKELY (len > 0))
1246 idx = scm_to_unsigned_integer (k, 0, len - 1);
1247 else
1248 scm_out_of_range (NULL, k);
1249
9c44cd45
MG
1250 if (scm_i_is_narrow_string (str))
1251 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1252 else
1253 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
0f2d19dd 1254}
1bbd0b84 1255#undef FUNC_NAME
0f2d19dd 1256
3ee86942
MV
1257SCM
1258scm_c_string_ref (SCM str, size_t p)
1259{
1260 if (p >= scm_i_string_length (str))
1261 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1262 if (scm_i_is_narrow_string (str))
1263 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1264 else
1265 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1266
3ee86942 1267}
f0942910 1268
3b3b36dd 1269SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
6fa73e72 1270 (SCM str, SCM k, SCM chr),
9c44cd45
MG
1271 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1272 "an unspecified value. @var{k} must be a valid index of\n"
1273 "@var{str}.")
1bbd0b84 1274#define FUNC_NAME s_scm_string_set_x
0f2d19dd 1275{
3ae3166b 1276 size_t len;
a55c2b68
MV
1277 unsigned long idx;
1278
f0942910 1279 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1280
1281 len = scm_i_string_length (str);
1282 if (SCM_LIKELY (len > 0))
1283 idx = scm_to_unsigned_integer (k, 0, len - 1);
1284 else
1285 scm_out_of_range (NULL, k);
1286
34d19ef6 1287 SCM_VALIDATE_CHAR (3, chr);
9c44cd45
MG
1288 str = scm_i_string_start_writing (str);
1289 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1290 scm_i_string_stop_writing ();
1291
0f2d19dd
JB
1292 return SCM_UNSPECIFIED;
1293}
1bbd0b84 1294#undef FUNC_NAME
0f2d19dd 1295
3ee86942
MV
1296void
1297scm_c_string_set_x (SCM str, size_t p, SCM chr)
1298{
1299 if (p >= scm_i_string_length (str))
1300 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1301 str = scm_i_string_start_writing (str);
1302 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1303 scm_i_string_stop_writing ();
3ee86942 1304}
0f2d19dd 1305
3b3b36dd 1306SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
0d26a824
MG
1307 (SCM str, SCM start, SCM end),
1308 "Return a newly allocated string formed from the characters\n"
1309 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1310 "ending with index @var{end} (exclusive).\n"
1311 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1312 "exact integers satisfying:\n\n"
1313 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1bbd0b84 1314#define FUNC_NAME s_scm_substring
0f2d19dd 1315{
3ee86942 1316 size_t len, from, to;
685c0d71 1317
d1ca2c64 1318 SCM_VALIDATE_STRING (1, str);
3ee86942
MV
1319 len = scm_i_string_length (str);
1320 from = scm_to_unsigned_integer (start, 0, len);
a55c2b68 1321 if (SCM_UNBNDP (end))
3ee86942 1322 to = len;
a55c2b68 1323 else
3ee86942
MV
1324 to = scm_to_unsigned_integer (end, from, len);
1325 return scm_i_substring (str, from, to);
0f2d19dd 1326}
1bbd0b84 1327#undef FUNC_NAME
0f2d19dd 1328
ed35de72
MV
1329SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1330 (SCM str, SCM start, SCM end),
1331 "Return a newly allocated string formed from the characters\n"
1332 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1333 "ending with index @var{end} (exclusive).\n"
1334 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1335 "exact integers satisfying:\n"
1336 "\n"
1337 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1338 "\n"
1339 "The returned string is read-only.\n")
1340#define FUNC_NAME s_scm_substring_read_only
1341{
1342 size_t len, from, to;
1343
1344 SCM_VALIDATE_STRING (1, str);
1345 len = scm_i_string_length (str);
1346 from = scm_to_unsigned_integer (start, 0, len);
1347 if (SCM_UNBNDP (end))
1348 to = len;
1349 else
1350 to = scm_to_unsigned_integer (end, from, len);
1351 return scm_i_substring_read_only (str, from, to);
1352}
1353#undef FUNC_NAME
1354
3ee86942
MV
1355SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1356 (SCM str, SCM start, SCM end),
1357 "Return a newly allocated string formed from the characters\n"
1358 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1359 "ending with index @var{end} (exclusive).\n"
1360 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1361 "exact integers satisfying:\n\n"
1362 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1363#define FUNC_NAME s_scm_substring_copy
1364{
e1b29f6a
MV
1365 /* For the Scheme version, START is mandatory, but for the C
1366 version, it is optional. See scm_string_copy in srfi-13.c for a
1367 rationale.
1368 */
1369
1370 size_t from, to;
3ee86942
MV
1371
1372 SCM_VALIDATE_STRING (1, str);
e1b29f6a
MV
1373 scm_i_get_substring_spec (scm_i_string_length (str),
1374 start, &from, end, &to);
3ee86942
MV
1375 return scm_i_substring_copy (str, from, to);
1376}
1377#undef FUNC_NAME
1378
1379SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1380 (SCM str, SCM start, SCM end),
1381 "Return string that indirectly refers to the characters\n"
1382 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1383 "ending with index @var{end} (exclusive).\n"
1384 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1385 "exact integers satisfying:\n\n"
1386 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1387#define FUNC_NAME s_scm_substring_shared
1388{
1389 size_t len, from, to;
1390
1391 SCM_VALIDATE_STRING (1, str);
1392 len = scm_i_string_length (str);
1393 from = scm_to_unsigned_integer (start, 0, len);
1394 if (SCM_UNBNDP (end))
1395 to = len;
1396 else
1397 to = scm_to_unsigned_integer (end, from, len);
1398 return scm_i_substring_shared (str, from, to);
1399}
1400#undef FUNC_NAME
685c0d71 1401
3b3b36dd 1402SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
6fa73e72 1403 (SCM args),
9c44cd45 1404 "Return a newly allocated string whose characters form the\n"
0d26a824 1405 "concatenation of the given strings, @var{args}.")
1bbd0b84 1406#define FUNC_NAME s_scm_string_append
0f2d19dd
JB
1407{
1408 SCM res;
2f13a466
MW
1409 size_t total = 0;
1410 size_t len;
9c44cd45 1411 int wide = 0;
c829a427 1412 SCM l, s;
bd4911ef 1413 size_t i;
9909c395
MG
1414 union
1415 {
1416 char *narrow;
1417 scm_t_wchar *wide;
1418 } data;
af45e3b0
DH
1419
1420 SCM_VALIDATE_REST_ARGUMENT (args);
9c44cd45 1421 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427
MV
1422 {
1423 s = SCM_CAR (l);
1424 SCM_VALIDATE_STRING (SCM_ARGn, s);
2f13a466
MW
1425 len = scm_i_string_length (s);
1426 if (((size_t) -1) - total < len)
1427 scm_num_overflow (s_scm_string_append);
1428 total += len;
9c44cd45
MG
1429 if (!scm_i_is_narrow_string (s))
1430 wide = 1;
c829a427 1431 }
9909c395 1432 data.narrow = NULL;
9c44cd45 1433 if (!wide)
2f13a466 1434 res = scm_i_make_string (total, &data.narrow, 0);
9c44cd45 1435 else
2f13a466 1436 res = scm_i_make_wide_string (total, &data.wide, 0);
9c44cd45
MG
1437
1438 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427 1439 {
edea856c 1440 size_t len;
c829a427 1441 s = SCM_CAR (l);
3ee86942 1442 SCM_VALIDATE_STRING (SCM_ARGn, s);
edea856c 1443 len = scm_i_string_length (s);
2f13a466
MW
1444 if (len > total)
1445 SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
9c44cd45
MG
1446 if (!wide)
1447 {
9909c395
MG
1448 memcpy (data.narrow, scm_i_string_chars (s), len);
1449 data.narrow += len;
9c44cd45
MG
1450 }
1451 else
1452 {
1453 if (scm_i_is_narrow_string (s))
1454 {
2f13a466
MW
1455 const char *src = scm_i_string_chars (s);
1456 for (i = 0; i < len; i++)
1457 data.wide[i] = (unsigned char) src[i];
9c44cd45
MG
1458 }
1459 else
9909c395 1460 u32_cpy ((scm_t_uint32 *) data.wide,
9c44cd45 1461 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
9909c395 1462 data.wide += len;
9c44cd45 1463 }
2f13a466 1464 total -= len;
c829a427
MV
1465 scm_remember_upto_here_1 (s);
1466 }
2f13a466
MW
1467 if (total != 0)
1468 SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
0f2d19dd
JB
1469 return res;
1470}
1bbd0b84 1471#undef FUNC_NAME
0f2d19dd 1472
24933780 1473
a3d7d5d5 1474\f
c62da8f8 1475/* Charset conversion error handling. */
a3d7d5d5
LC
1476
1477SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
c62da8f8
LC
1478SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1479
6851d3be
LC
1480/* Raise an exception informing that character CHR could not be written
1481 to PORT in its current encoding. */
d14418a5 1482void
ef7e4ba3 1483scm_encoding_error (const char *subr, int err, const char *message,
6851d3be 1484 SCM port, SCM chr)
ef7e4ba3 1485{
ef7e4ba3 1486 scm_throw (scm_encoding_error_key,
d050ef66
AW
1487 scm_list_n (scm_from_latin1_string (subr),
1488 scm_from_latin1_string (message),
ef7e4ba3 1489 scm_from_int (err),
6851d3be 1490 port, chr,
ef7e4ba3 1491 SCM_UNDEFINED));
a3d7d5d5
LC
1492}
1493
c62da8f8
LC
1494/* Raise an exception informing of an encoding error on PORT. This
1495 means that a character could not be written in PORT's encoding. */
1496void
1497scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1498{
1499 scm_throw (scm_decoding_error_key,
d050ef66
AW
1500 scm_list_n (scm_from_latin1_string (subr),
1501 scm_from_latin1_string (message),
c62da8f8
LC
1502 scm_from_int (err),
1503 port,
1504 SCM_UNDEFINED));
1505}
1506
1507\f
1508/* String conversion to/from C. */
1509
fac32b51 1510SCM
587a3355
MG
1511scm_from_stringn (const char *str, size_t len, const char *encoding,
1512 scm_t_string_failed_conversion_handler handler)
1513{
1514 size_t u32len, i;
1515 scm_t_wchar *u32;
1516 int wide = 0;
1517 SCM res;
1518
d40e1ca8 1519 /* The order of these checks is important. */
a574564c 1520 if (!str && len != 0)
d40e1ca8
AW
1521 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1522 if (len == (size_t) -1)
1523 len = strlen (str);
fac32b51 1524
75321259 1525 if (encoding == NULL || len == 0)
889975e5 1526 {
75321259 1527 /* If encoding is null (or the string is empty), use Latin-1. */
889975e5 1528 char *buf;
190d4b0d 1529 res = scm_i_make_string (len, &buf, 0);
889975e5
MG
1530 memcpy (buf, str, len);
1531 return res;
1532 }
1533
587a3355
MG
1534 u32len = 0;
1535 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1536 (enum iconv_ilseq_handler)
1537 handler,
1538 str, len,
1539 NULL,
1540 NULL, &u32len);
1541
ef7e4ba3 1542 if (SCM_UNLIKELY (u32 == NULL))
587a3355 1543 {
ef7e4ba3
LC
1544 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1545 handler. */
1546 SCM bv;
1547 signed char *buf;
1548
1549 buf = scm_gc_malloc_pointerless (len, "bytevector");
1550 memcpy (buf, str, len);
fb031aba 1551 bv = scm_c_take_gc_bytevector (buf, len);
ef7e4ba3 1552
c62da8f8
LC
1553 scm_decoding_error (__func__, errno,
1554 "input locale conversion error", bv);
587a3355
MG
1555 }
1556
1557 i = 0;
1558 while (i < u32len)
1559 if (u32[i++] > 0xFF)
1560 {
1561 wide = 1;
1562 break;
1563 }
1564
1565 if (!wide)
1566 {
1567 char *dst;
190d4b0d 1568 res = scm_i_make_string (u32len, &dst, 0);
587a3355
MG
1569 for (i = 0; i < u32len; i ++)
1570 dst[i] = (unsigned char) u32[i];
1571 dst[u32len] = '\0';
1572 }
1573 else
1574 {
1575 scm_t_wchar *wdst;
190d4b0d 1576 res = scm_i_make_wide_string (u32len, &wdst, 0);
587a3355
MG
1577 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1578 wdst[u32len] = 0;
1579 }
1580
1581 free (u32);
1582 return res;
1583}
1584
cf313a94 1585SCM
d40e1ca8 1586scm_from_locale_string (const char *str)
cf313a94 1587{
d40e1ca8 1588 return scm_from_locale_stringn (str, -1);
cf313a94
MG
1589}
1590
c829a427
MV
1591SCM
1592scm_from_locale_stringn (const char *str, size_t len)
1593{
95f5e303 1594 return scm_from_stringn (str, len, locale_charset (),
b22e94db 1595 scm_i_default_port_conversion_handler ());
c829a427 1596}
4d4528e7 1597
c829a427 1598SCM
d40e1ca8 1599scm_from_latin1_string (const char *str)
4d4528e7 1600{
d40e1ca8
AW
1601 return scm_from_latin1_stringn (str, -1);
1602}
9c44cd45 1603
d40e1ca8
AW
1604SCM
1605scm_from_latin1_stringn (const char *str, size_t len)
1606{
e9a35a96
LC
1607 char *buf;
1608 SCM result;
1609
1610 if (len == (size_t) -1)
1611 len = strlen (str);
1612
1613 /* Make a narrow string and copy STR as is. */
190d4b0d 1614 result = scm_i_make_string (len, &buf, 0);
e9a35a96
LC
1615 memcpy (buf, str, len);
1616
1617 return result;
c829a427 1618}
4d4528e7 1619
587a3355 1620SCM
d40e1ca8 1621scm_from_utf8_string (const char *str)
587a3355 1622{
d40e1ca8
AW
1623 return scm_from_utf8_stringn (str, -1);
1624}
1625
1626SCM
1627scm_from_utf8_stringn (const char *str, size_t len)
1628{
1629 return scm_from_stringn (str, len, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
587a3355
MG
1630}
1631
647dc1ac
LC
1632SCM
1633scm_from_utf32_string (const scm_t_wchar *str)
1634{
1635 return scm_from_utf32_stringn (str, -1);
1636}
1637
1638SCM
1639scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1640{
1641 SCM result;
1642 scm_t_wchar *buf;
1643
1644 if (len == (size_t) -1)
1645 len = u32_strlen ((uint32_t *) str);
1646
190d4b0d 1647 result = scm_i_make_wide_string (len, &buf, 0);
647dc1ac
LC
1648 memcpy (buf, str, len * sizeof (scm_t_wchar));
1649 scm_i_try_narrow_string (result);
1650
1651 return result;
1652}
1653
50b1996f
MG
1654/* Create a new scheme string from the C string STR. The memory of
1655 STR may be used directly as storage for the new string. */
13a94556
LC
1656/* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1657 would be to register a finalizer to eventually free(3) STR, which isn't
1658 worth it. Should we just deprecate the `scm_take_' functions? */
c829a427
MV
1659SCM
1660scm_take_locale_stringn (char *str, size_t len)
1661{
13a94556 1662 SCM res;
48ddf0d9 1663
13a94556
LC
1664 res = scm_from_locale_stringn (str, len);
1665 free (str);
c829a427 1666
c829a427
MV
1667 return res;
1668}
1669
48ddf0d9
KR
1670SCM
1671scm_take_locale_string (char *str)
1672{
1673 return scm_take_locale_stringn (str, -1);
1674}
1675
f1ee6d54
LC
1676/* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1677 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
31d4d02b
LC
1678 Set *LENP to the size of the resulting string.
1679
1680 FIXME: This is a hack we should get rid of. See
1681 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1682 for details. */
1683static void
1684unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
9c44cd45
MG
1685{
1686 char *before, *after;
1687 size_t i, j;
1688
4ff2b9f4
LC
1689 before = buf;
1690 after = buf;
9c44cd45
MG
1691 i = 0;
1692 j = 0;
1693 while (i < *lenp)
1694 {
1695 if ((i <= *lenp - 6)
1696 && before[i] == '\\'
1697 && before[i + 1] == 'u'
1698 && before[i + 2] == '0' && before[i + 3] == '0')
1699 {
1700 /* Convert \u00NN to \xNN */
1701 after[j] = '\\';
1702 after[j + 1] = 'x';
30a6b9ca
MG
1703 after[j + 2] = tolower ((int) before[i + 4]);
1704 after[j + 3] = tolower ((int) before[i + 5]);
9c44cd45
MG
1705 i += 6;
1706 j += 4;
1707 }
1708 else if ((i <= *lenp - 10)
1709 && before[i] == '\\'
1710 && before[i + 1] == 'U'
1711 && before[i + 2] == '0' && before[i + 3] == '0')
1712 {
1713 /* Convert \U00NNNNNN to \UNNNNNN */
1714 after[j] = '\\';
1715 after[j + 1] = 'U';
30a6b9ca
MG
1716 after[j + 2] = tolower ((int) before[i + 4]);
1717 after[j + 3] = tolower ((int) before[i + 5]);
1718 after[j + 4] = tolower ((int) before[i + 6]);
1719 after[j + 5] = tolower ((int) before[i + 7]);
1720 after[j + 6] = tolower ((int) before[i + 8]);
1721 after[j + 7] = tolower ((int) before[i + 9]);
9c44cd45
MG
1722 i += 10;
1723 j += 8;
1724 }
1725 else
1726 {
1727 after[j] = before[i];
1728 i++;
1729 j++;
1730 }
1731 }
1732 *lenp = j;
9c44cd45
MG
1733}
1734
f1ee6d54
LC
1735/* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1736 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1737 of the resulting string. BUF must be large enough to handle the
1738 worst case when `\uXXXX' escapes (6 characters) are replaced by
1739 `\xXXXX;' (7 characters). */
31d4d02b
LC
1740static void
1741unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
d31b9519
MG
1742{
1743 char *before, *after;
1744 size_t i, j;
1745 /* The worst case is if the input string contains all 4-digit hex escapes.
1746 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1747 size_t max_out_len = (*lenp * 7) / 6 + 1;
1748 size_t nzeros, ndigits;
1749
4ff2b9f4 1750 before = buf;
d31b9519
MG
1751 after = alloca (max_out_len);
1752 i = 0;
1753 j = 0;
1754 while (i < *lenp)
1755 {
1756 if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1757 || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1758 {
1759 if (before[i + 1] == 'u')
1760 ndigits = 4;
1761 else if (before[i + 1] == 'U')
1762 ndigits = 8;
1763 else
1764 abort ();
1765
1766 /* Add the R6RS hex escape initial sequence. */
1767 after[j] = '\\';
1768 after[j + 1] = 'x';
1769
1770 /* Move string positions to the start of the hex numbers. */
1771 i += 2;
1772 j += 2;
1773
1774 /* Find the number of initial zeros in this hex number. */
1775 nzeros = 0;
1776 while (before[i + nzeros] == '0' && nzeros < ndigits)
1777 nzeros++;
1778
1779 /* Copy the number, skipping initial zeros, and then move the string
1780 positions. */
1781 if (nzeros == ndigits)
1782 {
1783 after[j] = '0';
1784 i += ndigits;
1785 j += 1;
1786 }
1787 else
1788 {
1789 int pos;
1790 for (pos = 0; pos < ndigits - nzeros; pos++)
1791 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1792 i += ndigits;
1793 j += (ndigits - nzeros);
1794 }
1795
1796 /* Add terminating semicolon. */
1797 after[j] = ';';
1798 j++;
1799 }
1800 else
1801 {
1802 after[j] = before[i];
1803 i++;
1804 j++;
1805 }
1806 }
1807 *lenp = j;
d31b9519
MG
1808 memcpy (before, after, j);
1809}
1810
cf313a94 1811char *
d40e1ca8 1812scm_to_locale_string (SCM str)
cf313a94 1813{
d40e1ca8 1814 return scm_to_locale_stringn (str, NULL);
cf313a94 1815}
d31b9519 1816
c829a427 1817char *
fac32b51 1818scm_to_locale_stringn (SCM str, size_t *lenp)
c829a427 1819{
b22e94db 1820 return scm_to_stringn (str, lenp,
95f5e303 1821 locale_charset (),
b22e94db 1822 scm_i_default_port_conversion_handler ());
9c44cd45
MG
1823}
1824
d40e1ca8
AW
1825char *
1826scm_to_latin1_string (SCM str)
1827{
1828 return scm_to_latin1_stringn (str, NULL);
1829}
1830
1831char *
1832scm_to_latin1_stringn (SCM str, size_t *lenp)
e9a35a96 1833#define FUNC_NAME "scm_to_latin1_stringn"
d40e1ca8 1834{
e9a35a96
LC
1835 char *result;
1836
1837 SCM_VALIDATE_STRING (1, str);
1838
1839 if (scm_i_is_narrow_string (str))
1840 {
fe133640
AW
1841 size_t len = scm_i_string_length (str);
1842
e9a35a96 1843 if (lenp)
fe133640 1844 *lenp = len;
e9a35a96 1845
fe133640 1846 result = scm_strndup (scm_i_string_data (str), len);
e9a35a96
LC
1847 }
1848 else
1849 result = scm_to_stringn (str, lenp, NULL,
fe133640 1850 SCM_FAILED_CONVERSION_ERROR);
e9a35a96
LC
1851
1852 return result;
d40e1ca8 1853}
e9a35a96 1854#undef FUNC_NAME
d40e1ca8
AW
1855
1856char *
1857scm_to_utf8_string (SCM str)
1858{
1859 return scm_to_utf8_stringn (str, NULL);
1860}
1861
e3d45974
AW
1862static size_t
1863latin1_u8_strlen (const scm_t_uint8 *str, size_t len)
1864{
1865 size_t ret, i;
1866 for (i = 0, ret = 0; i < len; i++)
1867 ret += (str[i] < 128) ? 1 : 2;
1868 return ret;
1869}
1870
1871static scm_t_uint8*
1872latin1_to_u8 (const scm_t_uint8 *str, size_t latin_len,
1873 scm_t_uint8 *u8_result, size_t *u8_lenp)
1874{
1875 size_t i, n;
1876 size_t u8_len = latin1_u8_strlen (str, latin_len);
1877
1878 if (!(u8_result && u8_lenp && *u8_lenp > u8_len))
1879 u8_result = scm_malloc (u8_len + 1);
1880 if (u8_lenp)
1881 *u8_lenp = u8_len;
1882
1883 for (i = 0, n = 0; i < latin_len; i++)
1884 n += u8_uctomb (u8_result + n, str[i], u8_len - n);
1885 if (n != u8_len)
1886 abort ();
1887 u8_result[n] = 0;
1888
1889 return u8_result;
1890}
1891
e26da7a2
MW
1892/* UTF-8 code table
1893
1894 (Note that this includes code points that are not allowed by Unicode,
1895 but since this function has no way to report an error, and its
1896 purpose is to determine the size of destination buffers for
1897 libunicode conversion functions, we err on the safe side and handle
1898 everything that libunicode might conceivably handle, now or in the
1899 future.)
2c1b7951
AW
1900
1901 Char. number range | UTF-8 octet sequence
1902 (hexadecimal) | (binary)
e26da7a2 1903 --------------------+------------------------------------------------------
2c1b7951
AW
1904 0000 0000-0000 007F | 0xxxxxxx
1905 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
1906 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
e26da7a2
MW
1907 0001 0000-001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
1908 0020 0000-03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
1909 0400 0000-7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2c1b7951
AW
1910*/
1911
1912static size_t
e26da7a2 1913u32_u8_length_in_bytes (const scm_t_uint32 *str, size_t len)
2c1b7951
AW
1914{
1915 size_t ret, i;
1916
1917 for (i = 0, ret = 0; i < len; i++)
1918 {
1919 scm_t_uint32 c = str[i];
1920
1921 if (c <= 0x7f)
1922 ret += 1;
1923 else if (c <= 0x7ff)
1924 ret += 2;
1925 else if (c <= 0xffff)
1926 ret += 3;
e26da7a2 1927 else if (c <= 0x1fffff)
2c1b7951 1928 ret += 4;
e26da7a2
MW
1929 else if (c <= 0x3ffffff)
1930 ret += 5;
1931 else
1932 ret += 6;
2c1b7951
AW
1933 }
1934
1935 return ret;
1936}
1937
d40e1ca8
AW
1938char *
1939scm_to_utf8_stringn (SCM str, size_t *lenp)
ee26a9eb 1940#define FUNC_NAME "scm_to_utf8_stringn"
d40e1ca8 1941{
ee26a9eb
AW
1942 SCM_VALIDATE_STRING (1, str);
1943
e3d45974
AW
1944 if (scm_i_is_narrow_string (str))
1945 return (char *) latin1_to_u8 ((scm_t_uint8 *) scm_i_string_chars (str),
1946 scm_i_string_length (str),
1947 NULL, lenp);
1948 else
2c1b7951 1949 {
bbb9f000 1950 scm_t_uint32 *chars = (scm_t_uint32 *) scm_i_string_wide_chars (str);
2c1b7951 1951 scm_t_uint8 *buf, *ret;
bbb9f000
MW
1952 size_t num_chars = scm_i_string_length (str);
1953 size_t num_bytes_predicted, num_bytes_actual;
2c1b7951 1954
bbb9f000 1955 num_bytes_predicted = u32_u8_length_in_bytes (chars, num_chars);
2c1b7951 1956
e26da7a2 1957 if (lenp)
2c1b7951 1958 {
bbb9f000
MW
1959 *lenp = num_bytes_predicted;
1960 buf = scm_malloc (num_bytes_predicted);
2c1b7951 1961 }
e26da7a2
MW
1962 else
1963 {
bbb9f000
MW
1964 buf = scm_malloc (num_bytes_predicted + 1);
1965 buf[num_bytes_predicted] = 0;
e26da7a2
MW
1966 }
1967
bbb9f000
MW
1968 num_bytes_actual = num_bytes_predicted;
1969 ret = u32_to_u8 (chars, num_chars, buf, &num_bytes_actual);
e26da7a2 1970
bbb9f000 1971 if (SCM_LIKELY (ret == buf && num_bytes_actual == num_bytes_predicted))
e26da7a2 1972 return (char *) ret;
2c1b7951
AW
1973
1974 /* An error: a bad codepoint. */
1975 {
1976 int saved_errno = errno;
1977
1978 free (buf);
1979 if (!saved_errno)
1980 abort ();
1981
1982 scm_decoding_error ("scm_to_utf8_stringn", errno,
1983 "invalid codepoint in string", str);
1984
1985 /* Not reached. */
1986 return NULL;
1987 }
1988 }
d40e1ca8 1989}
ee26a9eb 1990#undef FUNC_NAME
d40e1ca8 1991
647dc1ac
LC
1992scm_t_wchar *
1993scm_to_utf32_string (SCM str)
1994{
1995 return scm_to_utf32_stringn (str, NULL);
1996}
1997
1998scm_t_wchar *
1999scm_to_utf32_stringn (SCM str, size_t *lenp)
2000#define FUNC_NAME "scm_to_utf32_stringn"
2001{
2002 scm_t_wchar *result;
2003
2004 SCM_VALIDATE_STRING (1, str);
2005
2006 if (scm_i_is_narrow_string (str))
e3d45974
AW
2007 {
2008 scm_t_uint8 *codepoints;
2009 size_t i, len;
2010
2011 codepoints = (scm_t_uint8*) scm_i_string_chars (str);
2012 len = scm_i_string_length (str);
2013 if (lenp)
2014 *lenp = len;
2015
2016 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2017 for (i = 0; i < len; i++)
2018 result[i] = codepoints[i];
2019 result[len] = 0;
2020 }
647dc1ac
LC
2021 else
2022 {
2023 size_t len;
2024
2025 len = scm_i_string_length (str);
2026 if (lenp)
2027 *lenp = len;
2028
2029 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2030 memcpy (result, scm_i_string_wide_chars (str),
2031 len * sizeof (scm_t_wchar));
2032 result[len] = 0;
2033 }
2034
2035 return result;
2036}
2037#undef FUNC_NAME
2038
29bcdbb0
LC
2039/* Return a malloc(3)-allocated buffer containing the contents of STR encoded
2040 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
2041 the returned buffer. If the conversion to ENCODING fails, apply the strategy
2042 defined by HANDLER. */
9c44cd45 2043char *
587a3355 2044scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
eca29b02 2045 scm_t_string_failed_conversion_handler handler)
9c44cd45 2046{
9c44cd45
MG
2047 char *buf;
2048 size_t ilen, len, i;
889975e5
MG
2049 int ret;
2050 const char *enc;
4d4528e7 2051
3ee86942 2052 if (!scm_is_string (str))
c829a427 2053 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
2054 ilen = scm_i_string_length (str);
2055
2056 if (ilen == 0)
2057 {
2058 buf = scm_malloc (1);
2059 buf[0] = '\0';
2060 if (lenp)
2061 *lenp = 0;
2062 return buf;
2063 }
587a3355 2064
c829a427 2065 if (lenp == NULL)
9c44cd45
MG
2066 for (i = 0; i < ilen; i++)
2067 if (scm_i_string_ref (str, i) == '\0')
2068 scm_misc_error (NULL,
2069 "string contains #\\nul character: ~S",
2070 scm_list_1 (str));
2071
889975e5 2072 if (scm_i_is_narrow_string (str) && (encoding == NULL))
c829a427 2073 {
889975e5
MG
2074 /* If using native Latin-1 encoding, just copy the string
2075 contents. */
9c44cd45
MG
2076 if (lenp)
2077 {
2078 buf = scm_malloc (ilen);
2079 memcpy (buf, scm_i_string_chars (str), ilen);
2080 *lenp = ilen;
2081 return buf;
2082 }
2083 else
2084 {
2085 buf = scm_malloc (ilen + 1);
2086 memcpy (buf, scm_i_string_chars (str), ilen);
2087 buf[ilen] = '\0';
2088 return buf;
2089 }
c829a427 2090 }
9c44cd45 2091
587a3355 2092
9c44cd45
MG
2093 buf = NULL;
2094 len = 0;
889975e5
MG
2095 enc = encoding;
2096 if (enc == NULL)
2097 enc = "ISO-8859-1";
2098 if (scm_i_is_narrow_string (str))
2099 {
2100 ret = mem_iconveh (scm_i_string_chars (str), ilen,
2101 "ISO-8859-1", enc,
2102 (enum iconv_ilseq_handler) handler, NULL,
2103 &buf, &len);
9c44cd45 2104
889975e5 2105 if (ret != 0)
ef7e4ba3 2106 scm_encoding_error (__func__, errno,
6851d3be
LC
2107 "cannot convert narrow string to output locale",
2108 SCM_BOOL_F,
2109 /* FIXME: Faulty character unknown. */
2110 SCM_BOOL_F);
889975e5
MG
2111 }
2112 else
2113 {
d31b9519 2114 buf = u32_conv_to_encoding (enc,
889975e5 2115 (enum iconv_ilseq_handler) handler,
d31b9519 2116 (scm_t_uint32 *) scm_i_string_wide_chars (str),
889975e5
MG
2117 ilen,
2118 NULL,
2119 NULL, &len);
2120 if (buf == NULL)
ef7e4ba3 2121 scm_encoding_error (__func__, errno,
6851d3be
LC
2122 "cannot convert wide string to output locale",
2123 SCM_BOOL_F,
2124 /* FIXME: Faulty character unknown. */
2125 SCM_BOOL_F);
d31b9519
MG
2126 }
2127 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
2128 {
2129 if (SCM_R6RS_ESCAPES_P)
f1ee6d54
LC
2130 {
2131 /* The worst case is if the input string contains all 4-digit
2132 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
2133 (seven characters). Make BUF large enough to hold
2134 that. */
2135 buf = scm_realloc (buf, (len * 7) / 6 + 1);
31d4d02b 2136 unistring_escapes_to_r6rs_escapes (buf, &len);
f1ee6d54 2137 }
d31b9519 2138 else
31d4d02b 2139 unistring_escapes_to_guile_escapes (buf, &len);
4ff2b9f4
LC
2140
2141 buf = scm_realloc (buf, len);
889975e5 2142 }
9c44cd45 2143 if (lenp)
4d4528e7 2144 *lenp = len;
9c44cd45
MG
2145 else
2146 {
2147 buf = scm_realloc (buf, len + 1);
2148 buf[len] = '\0';
2149 }
24933780 2150
c829a427 2151 scm_remember_upto_here_1 (str);
9c44cd45 2152 return buf;
4d4528e7 2153}
af68e5e5 2154
c829a427
MV
2155size_t
2156scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
2157{
2158 size_t len;
9c44cd45 2159 char *result = NULL;
3ee86942 2160 if (!scm_is_string (str))
c829a427 2161 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
2162 result = scm_to_locale_stringn (str, &len);
2163
2164 memcpy (buf, result, (len > max_len) ? max_len : len);
2165 free (result);
2166
c829a427
MV
2167 scm_remember_upto_here_1 (str);
2168 return len;
2169}
af68e5e5 2170
a3d7d5d5
LC
2171\f
2172/* Unicode string normalization. */
2173
edb7bb47
JG
2174/* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2175 libguile/i18n.c. It would be useful to have this factored out into a more
2176 convenient location, but its use of alloca makes that tricky to do. */
2177
2178static SCM
2179normalize_str (SCM string, uninorm_t form)
2180{
2181 SCM ret;
2182 scm_t_uint32 *w_str;
2183 scm_t_wchar *cbuf;
2184 size_t rlen, len = scm_i_string_length (string);
2185
2186 if (scm_i_is_narrow_string (string))
2187 {
2188 size_t i;
2189 const char *buf = scm_i_string_chars (string);
2190
2191 w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2192
2193 for (i = 0; i < len; i ++)
2194 w_str[i] = (unsigned char) buf[i];
2195 w_str[len] = 0;
2196 }
d8164b04
JG
2197 else
2198 w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2199
edb7bb47
JG
2200 w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2201
190d4b0d 2202 ret = scm_i_make_wide_string (rlen, &cbuf, 0);
edb7bb47
JG
2203 u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2204 free (w_str);
d8164b04
JG
2205
2206 scm_i_try_narrow_string (ret);
2207
edb7bb47
JG
2208 return ret;
2209}
2210
2211SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2212 (SCM string),
2213 "Returns the NFC normalized form of @var{string}.")
2214#define FUNC_NAME s_scm_string_normalize_nfc
2215{
2216 SCM_VALIDATE_STRING (1, string);
2217 return normalize_str (string, UNINORM_NFC);
2218}
2219#undef FUNC_NAME
2220
2221SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2222 (SCM string),
2223 "Returns the NFD normalized form of @var{string}.")
2224#define FUNC_NAME s_scm_string_normalize_nfd
2225{
2226 SCM_VALIDATE_STRING (1, string);
2227 return normalize_str (string, UNINORM_NFD);
2228}
2229#undef FUNC_NAME
2230
2231SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2232 (SCM string),
2233 "Returns the NFKC normalized form of @var{string}.")
2234#define FUNC_NAME s_scm_string_normalize_nfkc
2235{
2236 SCM_VALIDATE_STRING (1, string);
2237 return normalize_str (string, UNINORM_NFKC);
2238}
2239#undef FUNC_NAME
2240
2241SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2242 (SCM string),
2243 "Returns the NFKD normalized form of @var{string}.")
2244#define FUNC_NAME s_scm_string_normalize_nfkd
2245{
2246 SCM_VALIDATE_STRING (1, string);
2247 return normalize_str (string, UNINORM_NFKD);
2248}
2249#undef FUNC_NAME
2250
7505c6e0
MW
2251/* converts C scm_array of strings to SCM scm_list of strings.
2252 If argc < 0, a null terminated scm_array is assumed.
2253 The current locale encoding is assumed */
9c44cd45 2254SCM
3ee86942
MV
2255scm_makfromstrs (int argc, char **argv)
2256{
2257 int i = argc;
2258 SCM lst = SCM_EOL;
2259 if (0 > i)
2260 for (i = 0; argv[i]; i++);
2261 while (i--)
2262 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2263 return lst;
2264}
2265
c829a427 2266/* Return a newly allocated array of char pointers to each of the strings
7505c6e0
MW
2267 in args, with a terminating NULL pointer. The strings are encoded using
2268 the current locale. */
c829a427
MV
2269
2270char **
2271scm_i_allocate_string_pointers (SCM list)
2a776823 2272#define FUNC_NAME "scm_i_allocate_string_pointers"
af68e5e5 2273{
c829a427 2274 char **result;
7505c6e0 2275 int list_len = scm_ilength (list);
c829a427
MV
2276 int i;
2277
7505c6e0 2278 if (list_len < 0)
c829a427
MV
2279 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2280
7505c6e0 2281 result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2a776823 2282 "string pointers");
7505c6e0 2283 result[list_len] = NULL;
c829a427 2284
7505c6e0 2285 /* The list might have been modified in another thread, so
c829a427
MV
2286 we check LIST before each access.
2287 */
7505c6e0 2288 for (i = 0; i < list_len && scm_is_pair (list); i++)
c829a427 2289 {
7505c6e0
MW
2290 SCM str = SCM_CAR (list);
2291 size_t len; /* String length in bytes */
2292 char *c_str = scm_to_locale_stringn (str, &len);
2293
2294 /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2295 scm_malloc to allocate the returned string, which must be
2296 explicitly deallocated. This forces us to copy the string a
2297 second time into a new buffer. Ideally there would be variants
2298 of scm_to_*_stringn that can return garbage-collected buffers. */
2299
2300 result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2301 memcpy (result[i], c_str, len);
2a776823 2302 result[i][len] = '\0';
7505c6e0 2303 free (c_str);
2a776823 2304
c829a427
MV
2305 list = SCM_CDR (list);
2306 }
2307
c829a427 2308 return result;
af68e5e5 2309}
2a776823 2310#undef FUNC_NAME
24933780 2311
6f14f578
MV
2312void
2313scm_i_get_substring_spec (size_t len,
2314 SCM start, size_t *cstart,
2315 SCM end, size_t *cend)
2316{
2317 if (SCM_UNBNDP (start))
2318 *cstart = 0;
2319 else
2320 *cstart = scm_to_unsigned_integer (start, 0, len);
2321
2322 if (SCM_UNBNDP (end))
2323 *cend = len;
2324 else
2325 *cend = scm_to_unsigned_integer (end, *cstart, len);
2326}
2327
3ee86942
MV
2328#if SCM_ENABLE_DEPRECATED
2329
556d75db
MV
2330/* When these definitions are removed, it becomes reasonable to use
2331 read-only strings for string literals. For that, change the reader
2332 to create string literals with scm_c_substring_read_only instead of
2333 with scm_c_substring_copy.
2334*/
2335
3ee86942 2336int
fe78c51a 2337scm_i_deprecated_stringp (SCM str)
3ee86942
MV
2338{
2339 scm_c_issue_deprecation_warning
2340 ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
2341
2616f0e0 2342 return scm_is_string (str);
3ee86942
MV
2343}
2344
2345char *
fe78c51a 2346scm_i_deprecated_string_chars (SCM str)
3ee86942
MV
2347{
2348 char *chars;
2349
2350 scm_c_issue_deprecation_warning
2351 ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
2352
2616f0e0
MV
2353 /* We don't accept shared substrings here since they are not
2354 null-terminated.
2355 */
2356 if (IS_SH_STRING (str))
c291b588
LC
2357 scm_misc_error (NULL,
2358 "SCM_STRING_CHARS does not work with shared substrings",
2616f0e0
MV
2359 SCM_EOL);
2360
877f06c3 2361 /* We explicitly test for read-only strings to produce a better
556d75db
MV
2362 error message.
2363 */
2364
2365 if (IS_RO_STRING (str))
c291b588
LC
2366 scm_misc_error (NULL,
2367 "SCM_STRING_CHARS does not work with read-only strings",
556d75db 2368 SCM_EOL);
c291b588 2369
2616f0e0 2370 /* The following is still wrong, of course...
3ee86942 2371 */
9c44cd45 2372 str = scm_i_string_start_writing (str);
3ee86942
MV
2373 chars = scm_i_string_writable_chars (str);
2374 scm_i_string_stop_writing ();
2375 return chars;
2376}
2377
2378size_t
fe78c51a 2379scm_i_deprecated_string_length (SCM str)
3ee86942
MV
2380{
2381 scm_c_issue_deprecation_warning
2382 ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
2383 return scm_c_string_length (str);
2384}
2385
2386#endif
2387
2a610be5
AW
2388static SCM
2389string_handle_ref (scm_t_array_handle *h, size_t index)
2390{
2391 return scm_c_string_ref (h->array, index);
2392}
2393
2394static void
2395string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2396{
2397 scm_c_string_set_x (h->array, index, val);
2398}
2399
2400static void
2401string_get_handle (SCM v, scm_t_array_handle *h)
2402{
2403 h->array = v;
2404 h->ndims = 1;
2405 h->dims = &h->dim0;
2406 h->dim0.lbnd = 0;
2407 h->dim0.ubnd = scm_c_string_length (v) - 1;
2408 h->dim0.inc = 1;
2409 h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2410 h->elements = h->writable_elements = NULL;
2411}
2412
c5f17102 2413SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2a610be5 2414 string_handle_ref, string_handle_set,
f65e0168
LC
2415 string_get_handle)
2416SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2a610be5 2417
0f2d19dd
JB
2418void
2419scm_init_strings ()
0f2d19dd 2420{
a7e392c1 2421 scm_nullstr = scm_i_make_string (0, NULL, 0);
7c33806a 2422
a0599745 2423#include "libguile/strings.x"
0f2d19dd
JB
2424}
2425
89e00824
ML
2426
2427/*
2428 Local Variables:
2429 c-file-style: "gnu"
2430 End:
2431*/