Add and use maybe_annotate_source helper in read.c
[bpt/guile.git] / libguile / strings.c
CommitLineData
a7e392c1 1/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
0f2d19dd 2 *
73be1d9e 3 * This library is free software; you can redistribute it and/or
53befeb7
NJ
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
0f2d19dd 7 *
53befeb7
NJ
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
0f2d19dd 12 *
73be1d9e
MV
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
53befeb7
NJ
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
73be1d9e 17 */
1bbd0b84 18
1bbd0b84 19
0f2d19dd 20\f
dbb605f5
LC
21#ifdef HAVE_CONFIG_H
22# include <config.h>
23#endif
0f2d19dd 24
cdd47ec7 25#include <alloca.h>
faf2c9d7 26#include <string.h>
3ee86942 27#include <stdio.h>
9c44cd45 28#include <ctype.h>
edb7bb47 29#include <uninorm.h>
9c44cd45 30#include <unistr.h>
eca29b02 31#include <uniconv.h>
faf2c9d7 32
889975e5
MG
33#include "striconveh.h"
34
a0599745
MD
35#include "libguile/_scm.h"
36#include "libguile/chars.h"
7c33806a 37#include "libguile/root.h"
a0599745 38#include "libguile/strings.h"
a3d7d5d5 39#include "libguile/error.h"
f45eccff 40#include "libguile/generalized-vectors.h"
1afff620 41#include "libguile/deprecation.h"
a0599745 42#include "libguile/validate.h"
d31b9519 43#include "libguile/private-options.h"
1afff620 44
0f2d19dd
JB
45\f
46
47/* {Strings}
48 */
49
3ee86942
MV
50
51/* Stringbufs
52 *
53 * XXX - keeping an accurate refcount during GC seems to be quite
54 * tricky, so we just keep score of whether a stringbuf might be
50b1996f 55 * shared, not whether it definitely is.
3ee86942
MV
56 *
57 * The scheme I (mvo) tried to keep an accurate reference count would
58 * recount all strings that point to a stringbuf during the mark-phase
59 * of the GC. This was done since one cannot access the stringbuf of
60 * a string when that string is freed (in order to decrease the
61 * reference count). The memory of the stringbuf might have been
62 * reused already for something completely different.
63 *
64 * This recounted worked for a small number of threads beating on
65 * cow-strings, but it failed randomly with more than 10 threads, say.
66 * I couldn't figure out what went wrong, so I used the conservative
67 * approach implemented below.
50b1996f 68 *
ba54a202
LC
69 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
70 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
71 * strings.
3ee86942
MV
72 */
73
ba54a202
LC
74/* The size in words of the stringbuf header (type tag + size). */
75#define STRINGBUF_HEADER_SIZE 2U
76
77#define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
78
35920c00 79#define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
5f236208 80#define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
3ee86942
MV
81
82#define STRINGBUF_TAG scm_tc7_stringbuf
83#define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
9c44cd45 84#define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
3ee86942 85
100e20c7 86#define STRINGBUF_CONTENTS(buf) ((void *) \
ba54a202
LC
87 SCM_CELL_OBJECT_LOC (buf, \
88 STRINGBUF_HEADER_SIZE))
100e20c7
LC
89#define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
90#define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
3ee86942 91
100e20c7 92#define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
3ee86942 93
9b41542f
LC
94#define SET_STRINGBUF_SHARED(buf) \
95 do \
96 { \
97 /* Don't modify BUF if it's already marked as shared since it might be \
98 a read-only, statically allocated stringbuf. */ \
99 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
100 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
101 } \
102 while (0)
3ee86942 103
56a3dcd4 104#ifdef SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
105static size_t lenhist[1001];
106#endif
107
50b1996f
MG
108/* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
109 characters. */
3ee86942
MV
110static SCM
111make_stringbuf (size_t len)
0f2d19dd 112{
3ee86942
MV
113 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
114 scm_i_symbol_chars, all stringbufs are null-terminated. Once
115 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
116 has been changed for scm_i_symbol_chars, this null-termination
117 can be dropped.
118 */
119
ba54a202
LC
120 SCM buf;
121
56a3dcd4 122#ifdef SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
123 if (len < 1000)
124 lenhist[len]++;
125 else
126 lenhist[1000]++;
127#endif
0f2d19dd 128
ba54a202
LC
129 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
130 "string"));
131
132 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
133 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
134
135 STRINGBUF_CHARS (buf)[len] = 0;
136
137 return buf;
3ee86942 138}
e53cc817 139
50b1996f
MG
140/* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
141 characters. */
9c44cd45
MG
142static SCM
143make_wide_stringbuf (size_t len)
144{
ba54a202
LC
145 SCM buf;
146 size_t raw_len;
147
56a3dcd4 148#ifdef SCM_STRING_LENGTH_HISTOGRAM
9c44cd45
MG
149 if (len < 1000)
150 lenhist[len]++;
151 else
152 lenhist[1000]++;
153#endif
154
ba54a202
LC
155 raw_len = (len + 1) * sizeof (scm_t_wchar);
156 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
157 "string"));
158
159 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
160 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
161
162 STRINGBUF_WIDE_CHARS (buf)[len] = 0;
163
164 return buf;
9c44cd45
MG
165}
166
ba54a202
LC
167/* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
168 characters from BUF. */
169static SCM
170wide_stringbuf (SCM buf)
9c44cd45 171{
ba54a202 172 SCM new_buf;
9c44cd45
MG
173
174 if (STRINGBUF_WIDE (buf))
ba54a202
LC
175 new_buf = buf;
176 else
9c44cd45 177 {
ba54a202
LC
178 size_t i, len;
179 scm_t_wchar *mem;
9c44cd45 180
ba54a202 181 len = STRINGBUF_LENGTH (buf);
9c44cd45 182
ba54a202 183 new_buf = make_wide_stringbuf (len);
9c44cd45 184
ba54a202 185 mem = STRINGBUF_WIDE_CHARS (new_buf);
9c44cd45 186 for (i = 0; i < len; i++)
ba54a202 187 mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
9c44cd45 188 mem[len] = 0;
9c44cd45 189 }
ba54a202
LC
190
191 return new_buf;
3ee86942 192}
bd9e24b3 193
ba54a202
LC
194/* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
195 characters from BUF, if possible. */
196static SCM
587a3355
MG
197narrow_stringbuf (SCM buf)
198{
ba54a202 199 SCM new_buf;
587a3355
MG
200
201 if (!STRINGBUF_WIDE (buf))
ba54a202
LC
202 new_buf = buf;
203 else
204 {
205 size_t i, len;
206 scm_t_wchar *wmem;
207 unsigned char *mem;
587a3355 208
ba54a202
LC
209 len = STRINGBUF_LENGTH (buf);
210 wmem = STRINGBUF_WIDE_CHARS (buf);
587a3355 211
ba54a202
LC
212 for (i = 0; i < len; i++)
213 if (wmem[i] > 0xFF)
214 /* BUF cannot be narrowed. */
215 return buf;
587a3355 216
ba54a202 217 new_buf = make_stringbuf (len);
587a3355 218
ba54a202
LC
219 mem = STRINGBUF_CHARS (new_buf);
220 for (i = 0; i < len; i++)
221 mem[i] = (unsigned char) wmem[i];
222 mem[len] = 0;
223 }
224
225 return new_buf;
587a3355
MG
226}
227
9de87eea 228scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
bd9e24b3 229
ba54a202 230\f
3ee86942
MV
231/* Copy-on-write strings.
232 */
bd9e24b3 233
3ee86942 234#define STRING_TAG scm_tc7_string
bd9e24b3 235
3ee86942
MV
236#define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
237#define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
238#define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
bd9e24b3 239
3ee86942
MV
240#define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
241#define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
242
243#define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
244
ed35de72
MV
245/* Read-only strings.
246 */
247
35920c00 248#define RO_STRING_TAG scm_tc7_ro_string
ed35de72
MV
249#define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
250
e1b29f6a
MV
251/* Mutation-sharing substrings
252 */
253
254#define SH_STRING_TAG (scm_tc7_string + 0x100)
255
256#define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
257/* START and LENGTH as for STRINGs. */
258
259#define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
260
e7efe8e7
AW
261SCM scm_nullstr;
262
50b1996f
MG
263/* Create a scheme string with space for LEN 8-bit Latin-1-encoded
264 characters. CHARSP, if not NULL, will be set to location of the
190d4b0d
LC
265 char array. If READ_ONLY_P, the returned string is read-only;
266 otherwise it is writable. */
3ee86942 267SCM
190d4b0d 268scm_i_make_string (size_t len, char **charsp, int read_only_p)
3ee86942 269{
69cd5299
MW
270 static SCM null_stringbuf = SCM_BOOL_F;
271 SCM buf;
3ee86942 272 SCM res;
69cd5299
MW
273
274 if (len == 0)
275 {
276 if (SCM_UNLIKELY (scm_is_false (null_stringbuf)))
277 {
278 null_stringbuf = make_stringbuf (0);
279 SET_STRINGBUF_SHARED (null_stringbuf);
280 }
281 buf = null_stringbuf;
282 }
283 else
284 buf = make_stringbuf (len);
285
3ee86942 286 if (charsp)
f59cf998 287 *charsp = (char *) STRINGBUF_CHARS (buf);
190d4b0d
LC
288 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
289 SCM_UNPACK (buf),
290 (scm_t_bits) 0, (scm_t_bits) len);
3ee86942 291 return res;
0f2d19dd
JB
292}
293
50b1996f
MG
294/* Create a scheme string with space for LEN 32-bit UCS-4-encoded
295 characters. CHARSP, if not NULL, will be set to location of the
190d4b0d
LC
296 character array. If READ_ONLY_P, the returned string is read-only;
297 otherwise it is writable. */
9c44cd45 298SCM
190d4b0d 299scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
9c44cd45
MG
300{
301 SCM buf = make_wide_stringbuf (len);
302 SCM res;
303 if (charsp)
304 *charsp = STRINGBUF_WIDE_CHARS (buf);
190d4b0d
LC
305 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
306 SCM_UNPACK (buf),
9c44cd45
MG
307 (scm_t_bits) 0, (scm_t_bits) len);
308 return res;
309}
310
3ee86942
MV
311static void
312validate_substring_args (SCM str, size_t start, size_t end)
313{
314 if (!IS_STRING (str))
315 scm_wrong_type_arg_msg (NULL, 0, str, "string");
316 if (start > STRING_LENGTH (str))
317 scm_out_of_range (NULL, scm_from_size_t (start));
318 if (end > STRING_LENGTH (str) || end < start)
319 scm_out_of_range (NULL, scm_from_size_t (end));
320}
0f2d19dd 321
e1b29f6a
MV
322static inline void
323get_str_buf_start (SCM *str, SCM *buf, size_t *start)
324{
325 *start = STRING_START (*str);
326 if (IS_SH_STRING (*str))
327 {
328 *str = SH_STRING_STRING (*str);
329 *start += STRING_START (*str);
330 }
331 *buf = STRING_STRINGBUF (*str);
332}
333
3ee86942
MV
334SCM
335scm_i_substring (SCM str, size_t start, size_t end)
0f2d19dd 336{
17bec545
MW
337 if (start == end)
338 return scm_i_make_string (0, NULL, 0);
339 else
340 {
341 SCM buf;
342 size_t str_start;
343 get_str_buf_start (&str, &buf, &str_start);
344 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
345 SET_STRINGBUF_SHARED (buf);
346 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
347 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
348 (scm_t_bits)str_start + start,
349 (scm_t_bits) end - start);
350 }
0f2d19dd
JB
351}
352
ed35de72
MV
353SCM
354scm_i_substring_read_only (SCM str, size_t start, size_t end)
355{
17bec545
MW
356 if (start == end)
357 return scm_i_make_string (0, NULL, 1);
358 else
359 {
360 SCM buf;
361 size_t str_start;
362 get_str_buf_start (&str, &buf, &str_start);
363 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
364 SET_STRINGBUF_SHARED (buf);
365 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
366 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
367 (scm_t_bits)str_start + start,
368 (scm_t_bits) end - start);
369 }
ed35de72
MV
370}
371
3ee86942
MV
372SCM
373scm_i_substring_copy (SCM str, size_t start, size_t end)
374{
375 size_t len = end - start;
d6a569c1 376 SCM buf, my_buf, substr;
e1b29f6a 377 size_t str_start;
d6a569c1 378 int wide = 0;
e1b29f6a 379 get_str_buf_start (&str, &buf, &str_start);
9c44cd45
MG
380 if (scm_i_is_narrow_string (str))
381 {
382 my_buf = make_stringbuf (len);
383 memcpy (STRINGBUF_CHARS (my_buf),
384 STRINGBUF_CHARS (buf) + str_start + start, len);
385 }
386 else
387 {
388 my_buf = make_wide_stringbuf (len);
389 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
390 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
391 + start), len);
d6a569c1 392 wide = 1;
9c44cd45 393 }
3ee86942 394 scm_remember_upto_here_1 (buf);
d6a569c1
MW
395 substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
396 (scm_t_bits) 0, (scm_t_bits) len);
397 if (wide)
398 scm_i_try_narrow_string (substr);
399 return substr;
3ee86942 400}
0f2d19dd 401
e1b29f6a
MV
402SCM
403scm_i_substring_shared (SCM str, size_t start, size_t end)
404{
405 if (start == 0 && end == STRING_LENGTH (str))
406 return str;
17bec545
MW
407 else if (start == end)
408 return scm_i_make_string (0, NULL, 0);
409 else
e1b29f6a
MV
410 {
411 size_t len = end - start;
412 if (IS_SH_STRING (str))
413 {
414 start += STRING_START (str);
415 str = SH_STRING_STRING (str);
416 }
417 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
418 (scm_t_bits)start, (scm_t_bits) len);
419 }
420}
421
3ee86942
MV
422SCM
423scm_c_substring (SCM str, size_t start, size_t end)
424{
425 validate_substring_args (str, start, end);
426 return scm_i_substring (str, start, end);
427}
ee149d03 428
ed35de72
MV
429SCM
430scm_c_substring_read_only (SCM str, size_t start, size_t end)
431{
432 validate_substring_args (str, start, end);
433 return scm_i_substring_read_only (str, start, end);
434}
435
0f2d19dd 436SCM
3ee86942 437scm_c_substring_copy (SCM str, size_t start, size_t end)
0f2d19dd 438{
3ee86942
MV
439 validate_substring_args (str, start, end);
440 return scm_i_substring_copy (str, start, end);
441}
442
3ee86942
MV
443SCM
444scm_c_substring_shared (SCM str, size_t start, size_t end)
445{
446 validate_substring_args (str, start, end);
447 return scm_i_substring_shared (str, start, end);
448}
0f2d19dd 449
d6c74168 450\f
3ee86942
MV
451/* Internal accessors
452 */
453
50b1996f
MG
454/* Returns the number of characters in STR. This may be different
455 than the memory size of the string storage. */
3ee86942
MV
456size_t
457scm_i_string_length (SCM str)
0f2d19dd 458{
3ee86942 459 return STRING_LENGTH (str);
0f2d19dd
JB
460}
461
50b1996f
MG
462/* True if the string is 'narrow', meaning it has a 8-bit Latin-1
463 encoding. False if it is 'wide', having a 32-bit UCS-4
464 encoding. */
9c44cd45
MG
465int
466scm_i_is_narrow_string (SCM str)
467{
49d09292
MW
468 if (IS_SH_STRING (str))
469 str = SH_STRING_STRING (str);
470
9c44cd45
MG
471 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
472}
473
587a3355
MG
474/* Try to coerce a string to be narrow. It if is narrow already, do
475 nothing. If it is wide, shrink it to narrow if none of its
476 characters are above 0xFF. Return true if the string is narrow or
477 was made to be narrow. */
478int
479scm_i_try_narrow_string (SCM str)
480{
49d09292
MW
481 if (IS_SH_STRING (str))
482 str = SH_STRING_STRING (str);
483
ba54a202 484 SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
587a3355
MG
485
486 return scm_i_is_narrow_string (str);
487}
488
100e20c7
LC
489/* Return a pointer to the raw data of the string, which can be either Latin-1
490 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
491const void *
492scm_i_string_data (SCM str)
493{
494 SCM buf;
495 size_t start;
496 const char *data;
497
498 get_str_buf_start (&str, &buf, &start);
499
500 data = STRINGBUF_CONTENTS (buf);
501 data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
502
503 return data;
504}
505
50b1996f
MG
506/* Returns a pointer to the 8-bit Latin-1 encoded character array of
507 STR. */
3ee86942
MV
508const char *
509scm_i_string_chars (SCM str)
510{
511 SCM buf;
e1b29f6a
MV
512 size_t start;
513 get_str_buf_start (&str, &buf, &start);
9c44cd45 514 if (scm_i_is_narrow_string (str))
f59cf998 515 return (const char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
516 else
517 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
518 scm_list_1 (str));
519 return NULL;
3ee86942 520}
b00418df 521
50b1996f
MG
522/* Returns a pointer to the 32-bit UCS-4 encoded character array of
523 STR. */
9c44cd45
MG
524const scm_t_wchar *
525scm_i_string_wide_chars (SCM str)
526{
527 SCM buf;
528 size_t start;
529
530 get_str_buf_start (&str, &buf, &start);
531 if (!scm_i_is_narrow_string (str))
f59cf998 532 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
9c44cd45
MG
533 else
534 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
535 scm_list_1 (str));
536}
537
538/* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
539 a new string buffer, so that it can be modified without modifying
50b1996f
MG
540 other strings. Also, lock the string mutex. Later, one must call
541 scm_i_string_stop_writing to unlock the mutex. */
9c44cd45
MG
542SCM
543scm_i_string_start_writing (SCM orig_str)
b00418df 544{
ed35de72 545 SCM buf, str = orig_str;
e1b29f6a 546 size_t start;
ed35de72 547
e1b29f6a 548 get_str_buf_start (&str, &buf, &start);
ed35de72
MV
549 if (IS_RO_STRING (str))
550 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
551
9de87eea 552 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942
MV
553 if (STRINGBUF_SHARED (buf))
554 {
9c44cd45 555 /* Clone the stringbuf. */
3ee86942
MV
556 size_t len = STRING_LENGTH (str);
557 SCM new_buf;
558
9de87eea 559 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 560
9c44cd45
MG
561 if (scm_i_is_narrow_string (str))
562 {
563 new_buf = make_stringbuf (len);
564 memcpy (STRINGBUF_CHARS (new_buf),
565 STRINGBUF_CHARS (buf) + STRING_START (str), len);
566
567 }
568 else
569 {
570 new_buf = make_wide_stringbuf (len);
571 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
572 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
573 + STRING_START (str)), len);
574 }
3ee86942 575
3ee86942
MV
576 SET_STRING_STRINGBUF (str, new_buf);
577 start -= STRING_START (str);
902578f1
LC
578
579 /* FIXME: The following operations are not atomic, so other threads
580 looking at STR may see an inconsistent state. Nevertheless it can't
581 hurt much since (i) accessing STR while it is being mutated can't
582 yield a crash, and (ii) concurrent accesses to STR should be
583 protected by a mutex at the application level. The latter may not
584 apply when STR != ORIG_STR, though. */
3ee86942 585 SET_STRING_START (str, 0);
902578f1 586 SET_STRING_STRINGBUF (str, new_buf);
3ee86942
MV
587
588 buf = new_buf;
589
9de87eea 590 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 591 }
9c44cd45
MG
592 return orig_str;
593}
3ee86942 594
50b1996f 595/* Return a pointer to the 8-bit Latin-1 chars of a string. */
9c44cd45
MG
596char *
597scm_i_string_writable_chars (SCM str)
598{
599 SCM buf;
600 size_t start;
3ee86942 601
9c44cd45
MG
602 get_str_buf_start (&str, &buf, &start);
603 if (scm_i_is_narrow_string (str))
f59cf998 604 return (char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
605 else
606 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
607 scm_list_1 (str));
608 return NULL;
b00418df
DH
609}
610
50b1996f 611/* Return a pointer to the UCS-4 codepoints of a string. */
9c44cd45
MG
612static scm_t_wchar *
613scm_i_string_writable_wide_chars (SCM str)
614{
615 SCM buf;
616 size_t start;
617
618 get_str_buf_start (&str, &buf, &start);
619 if (!scm_i_is_narrow_string (str))
620 return STRINGBUF_WIDE_CHARS (buf) + start;
621 else
1c7b216f 622 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
9c44cd45 623 scm_list_1 (str));
b00418df
DH
624}
625
50b1996f
MG
626/* Unlock the string mutex that was locked when
627 scm_i_string_start_writing was called. */
3ee86942
MV
628void
629scm_i_string_stop_writing (void)
630{
9de87eea 631 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 632}
b00418df 633
50b1996f 634/* Return the Xth character of STR as a UCS-4 codepoint. */
9c44cd45
MG
635scm_t_wchar
636scm_i_string_ref (SCM str, size_t x)
637{
638 if (scm_i_is_narrow_string (str))
639 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
640 else
641 return scm_i_string_wide_chars (str)[x];
642}
643
889975e5
MG
644/* Returns index+1 of the first char in STR that matches C, or
645 0 if the char is not found. */
646int
647scm_i_string_contains_char (SCM str, char ch)
648{
649 size_t i;
650 size_t len = scm_i_string_length (str);
651
652 i = 0;
653 if (scm_i_is_narrow_string (str))
654 {
655 while (i < len)
656 {
657 if (scm_i_string_chars (str)[i] == ch)
658 return i+1;
659 i++;
660 }
661 }
662 else
663 {
664 while (i < len)
665 {
666 if (scm_i_string_wide_chars (str)[i]
667 == (unsigned char) ch)
668 return i+1;
669 i++;
670 }
671 }
672 return 0;
673}
674
3f47e526
MG
675int
676scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
677{
678 if (scm_i_is_narrow_string (sstr))
679 {
680 const char *a = scm_i_string_chars (sstr) + start_x;
681 const char *b = cstr;
682 return strncmp (a, b, strlen(b));
683 }
684 else
685 {
686 size_t i;
687 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
688 const char *b = cstr;
689 for (i = 0; i < strlen (b); i++)
690 {
691 if (a[i] != (unsigned char) b[i])
692 return 1;
693 }
694 }
695 return 0;
696}
697
50b1996f 698/* Set the Pth character of STR to UCS-4 codepoint CHR. */
9c44cd45
MG
699void
700scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
701{
49d09292
MW
702 if (IS_SH_STRING (str))
703 {
704 p += STRING_START (str);
705 str = SH_STRING_STRING (str);
706 }
707
9c44cd45 708 if (chr > 0xFF && scm_i_is_narrow_string (str))
ba54a202 709 SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
9c44cd45
MG
710
711 if (scm_i_is_narrow_string (str))
712 {
713 char *dst = scm_i_string_writable_chars (str);
587a3355 714 dst[p] = chr;
9c44cd45
MG
715 }
716 else
717 {
718 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
719 dst[p] = chr;
720 }
721}
722
ba54a202 723\f
3ee86942 724/* Symbols.
587a3355 725
3ee86942
MV
726 Basic symbol creation and accessing is done here, the rest is in
727 symbols.[hc]. This has been done to keep stringbufs and the
728 internals of strings and string-like objects confined to this file.
729*/
730
731#define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
732
733SCM
6869328b
MV
734scm_i_make_symbol (SCM name, scm_t_bits flags,
735 unsigned long hash, SCM props)
3ee86942
MV
736{
737 SCM buf;
738 size_t start = STRING_START (name);
739 size_t length = STRING_LENGTH (name);
740
741 if (IS_SH_STRING (name))
742 {
743 name = SH_STRING_STRING (name);
744 start += STRING_START (name);
745 }
746 buf = SYMBOL_STRINGBUF (name);
747
748 if (start == 0 && length == STRINGBUF_LENGTH (buf))
749 {
750 /* reuse buf. */
9de87eea 751 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 752 SET_STRINGBUF_SHARED (buf);
9de87eea 753 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942
MV
754 }
755 else
756 {
757 /* make new buf. */
9c44cd45
MG
758 if (scm_i_is_narrow_string (name))
759 {
760 SCM new_buf = make_stringbuf (length);
761 memcpy (STRINGBUF_CHARS (new_buf),
762 STRINGBUF_CHARS (buf) + start, length);
763 buf = new_buf;
764 }
765 else
766 {
767 SCM new_buf = make_wide_stringbuf (length);
768 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
769 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
770 length);
771 buf = new_buf;
772 }
3ee86942 773 }
6869328b 774 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
3ee86942
MV
775 (scm_t_bits) hash, SCM_UNPACK (props));
776}
777
fd0a5bbc
HWN
778SCM
779scm_i_c_make_symbol (const char *name, size_t len,
780 scm_t_bits flags, unsigned long hash, SCM props)
781{
782 SCM buf = make_stringbuf (len);
783 memcpy (STRINGBUF_CHARS (buf), name, len);
784
65619ebe
AW
785 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
786 (scm_t_bits) hash, SCM_UNPACK (props));
fd0a5bbc
HWN
787}
788
50b1996f
MG
789/* Returns the number of characters in SYM. This may be different
790 from the memory size of SYM. */
3ee86942
MV
791size_t
792scm_i_symbol_length (SCM sym)
0f2d19dd 793{
3ee86942 794 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
0f2d19dd
JB
795}
796
071bb6a8
LC
797size_t
798scm_c_symbol_length (SCM sym)
799#define FUNC_NAME "scm_c_symbol_length"
800{
801 SCM_VALIDATE_SYMBOL (1, sym);
802
803 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
804}
805#undef FUNC_NAME
806
50b1996f
MG
807/* True if the name of SYM is stored as a Latin-1 encoded string.
808 False if it is stored as a 32-bit UCS-4-encoded string. */
9c44cd45
MG
809int
810scm_i_is_narrow_symbol (SCM sym)
811{
812 SCM buf;
813
814 buf = SYMBOL_STRINGBUF (sym);
815 return !STRINGBUF_WIDE (buf);
816}
817
50b1996f
MG
818/* Returns a pointer to the 8-bit Latin-1 encoded character array that
819 contains the name of SYM. */
3ee86942
MV
820const char *
821scm_i_symbol_chars (SCM sym)
822{
9c44cd45
MG
823 SCM buf;
824
825 buf = SYMBOL_STRINGBUF (sym);
826 if (!STRINGBUF_WIDE (buf))
f59cf998 827 return (const char *) STRINGBUF_CHARS (buf);
9c44cd45
MG
828 else
829 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
830 scm_list_1 (sym));
831}
832
50b1996f
MG
833/* Return a pointer to the 32-bit UCS-4-encoded character array of a
834 symbol's name. */
9c44cd45
MG
835const scm_t_wchar *
836scm_i_symbol_wide_chars (SCM sym)
837{
838 SCM buf;
839
840 buf = SYMBOL_STRINGBUF (sym);
841 if (STRINGBUF_WIDE (buf))
f59cf998 842 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
9c44cd45
MG
843 else
844 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
845 scm_list_1 (sym));
3ee86942 846}
1cc91f1b 847
be54b15d 848SCM
3ee86942 849scm_i_symbol_substring (SCM sym, size_t start, size_t end)
be54b15d 850{
3ee86942 851 SCM buf = SYMBOL_STRINGBUF (sym);
9de87eea 852 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 853 SET_STRINGBUF_SHARED (buf);
9de87eea 854 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
fd2b17b9 855 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
3ee86942
MV
856 (scm_t_bits)start, (scm_t_bits) end - start);
857}
be54b15d 858
50b1996f 859/* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
9c44cd45
MG
860scm_t_wchar
861scm_i_symbol_ref (SCM sym, size_t x)
862{
863 if (scm_i_is_narrow_symbol (sym))
864 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
865 else
866 return scm_i_symbol_wide_chars (sym)[x];
867}
868
3ee86942
MV
869/* Debugging
870 */
be54b15d 871
6ce6923b
MG
872SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
873 "Returns an association list containing debugging information\n"
874 "for @var{str}. The association list has the following entries."
875 "@table @code\n"
876 "@item string\n"
877 "The string itself.\n"
878 "@item start\n"
879 "The start index of the string into its stringbuf\n"
880 "@item length\n"
881 "The length of the string\n"
882 "@item shared\n"
883 "If this string is a substring, it returns its parent string.\n"
884 "Otherwise, it returns @code{#f}\n"
88ed5759
MG
885 "@item read-only\n"
886 "@code{#t} if the string is read-only\n"
6ce6923b
MG
887 "@item stringbuf-chars\n"
888 "A new string containing this string's stringbuf's characters\n"
889 "@item stringbuf-length\n"
890 "The number of characters in this stringbuf\n"
891 "@item stringbuf-shared\n"
892 "@code{#t} if this stringbuf is shared\n"
6ce6923b
MG
893 "@item stringbuf-wide\n"
894 "@code{#t} if this stringbuf's characters are stored in a\n"
895 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
896 "buffer\n"
897 "@end table")
3ee86942
MV
898#define FUNC_NAME s_scm_sys_string_dump
899{
ba54a202 900 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
6ce6923b 901 SCM buf;
3ee86942 902 SCM_VALIDATE_STRING (1, str);
6ce6923b
MG
903
904 /* String info */
4a655e50 905 e1 = scm_cons (scm_from_latin1_symbol ("string"),
6ce6923b 906 str);
4a655e50 907 e2 = scm_cons (scm_from_latin1_symbol ("start"),
6ce6923b 908 scm_from_size_t (STRING_START (str)));
4a655e50 909 e3 = scm_cons (scm_from_latin1_symbol ("length"),
6ce6923b
MG
910 scm_from_size_t (STRING_LENGTH (str)));
911
3ee86942
MV
912 if (IS_SH_STRING (str))
913 {
4a655e50 914 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
6ce6923b
MG
915 SH_STRING_STRING (str));
916 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
3ee86942
MV
917 }
918 else
919 {
4a655e50 920 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
6ce6923b
MG
921 SCM_BOOL_F);
922 buf = STRING_STRINGBUF (str);
3ee86942 923 }
9c44cd45 924
88ed5759 925 if (IS_RO_STRING (str))
4a655e50 926 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
88ed5759
MG
927 SCM_BOOL_T);
928 else
4a655e50 929 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
88ed5759 930 SCM_BOOL_F);
587a3355 931
6ce6923b 932 /* Stringbuf info */
6ce6923b
MG
933 if (!STRINGBUF_WIDE (buf))
934 {
935 size_t len = STRINGBUF_LENGTH (buf);
936 char *cbuf;
190d4b0d 937 SCM sbc = scm_i_make_string (len, &cbuf, 0);
6ce6923b 938 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
4a655e50 939 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b 940 sbc);
3ee86942 941 }
6ce6923b
MG
942 else
943 {
944 size_t len = STRINGBUF_LENGTH (buf);
945 scm_t_wchar *cbuf;
190d4b0d 946 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
6ce6923b
MG
947 u32_cpy ((scm_t_uint32 *) cbuf,
948 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
4a655e50 949 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b
MG
950 sbc);
951 }
4a655e50 952 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
6ce6923b
MG
953 scm_from_size_t (STRINGBUF_LENGTH (buf)));
954 if (STRINGBUF_SHARED (buf))
4a655e50 955 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b
MG
956 SCM_BOOL_T);
957 else
4a655e50 958 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b 959 SCM_BOOL_F);
6ce6923b 960 if (STRINGBUF_WIDE (buf))
4a655e50 961 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
ba54a202 962 SCM_BOOL_T);
6ce6923b 963 else
4a655e50 964 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
ba54a202 965 SCM_BOOL_F);
6ce6923b 966
ba54a202 967 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
3ee86942
MV
968}
969#undef FUNC_NAME
970
6ce6923b
MG
971SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
972 "Returns an association list containing debugging information\n"
973 "for @var{sym}. The association list has the following entries."
974 "@table @code\n"
975 "@item symbol\n"
976 "The symbol itself\n"
977 "@item hash\n"
978 "Its hash value\n"
88ed5759
MG
979 "@item interned\n"
980 "@code{#t} if it is an interned symbol\n"
6ce6923b
MG
981 "@item stringbuf-chars\n"
982 "A new string containing this symbols's stringbuf's characters\n"
983 "@item stringbuf-length\n"
984 "The number of characters in this stringbuf\n"
985 "@item stringbuf-shared\n"
986 "@code{#t} if this stringbuf is shared\n"
6ce6923b
MG
987 "@item stringbuf-wide\n"
988 "@code{#t} if this stringbuf's characters are stored in a\n"
989 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
990 "buffer\n"
991 "@end table")
3ee86942
MV
992#define FUNC_NAME s_scm_sys_symbol_dump
993{
ba54a202 994 SCM e1, e2, e3, e4, e5, e6, e7;
6ce6923b 995 SCM buf;
3ee86942 996 SCM_VALIDATE_SYMBOL (1, sym);
4a655e50 997 e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
6ce6923b 998 sym);
4a655e50 999 e2 = scm_cons (scm_from_latin1_symbol ("hash"),
6ce6923b 1000 scm_from_ulong (scm_i_symbol_hash (sym)));
4a655e50 1001 e3 = scm_cons (scm_from_latin1_symbol ("interned"),
88ed5759 1002 scm_symbol_interned_p (sym));
6ce6923b
MG
1003 buf = SYMBOL_STRINGBUF (sym);
1004
1005 /* Stringbuf info */
6ce6923b
MG
1006 if (!STRINGBUF_WIDE (buf))
1007 {
1008 size_t len = STRINGBUF_LENGTH (buf);
1009 char *cbuf;
190d4b0d 1010 SCM sbc = scm_i_make_string (len, &cbuf, 0);
6ce6923b 1011 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
4a655e50 1012 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b
MG
1013 sbc);
1014 }
9c44cd45 1015 else
6ce6923b
MG
1016 {
1017 size_t len = STRINGBUF_LENGTH (buf);
1018 scm_t_wchar *cbuf;
190d4b0d 1019 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
6ce6923b
MG
1020 u32_cpy ((scm_t_uint32 *) cbuf,
1021 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
4a655e50 1022 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b
MG
1023 sbc);
1024 }
4a655e50 1025 e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
6ce6923b
MG
1026 scm_from_size_t (STRINGBUF_LENGTH (buf)));
1027 if (STRINGBUF_SHARED (buf))
4a655e50 1028 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b
MG
1029 SCM_BOOL_T);
1030 else
4a655e50 1031 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b 1032 SCM_BOOL_F);
6ce6923b 1033 if (STRINGBUF_WIDE (buf))
4a655e50 1034 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
6ce6923b
MG
1035 SCM_BOOL_T);
1036 else
4a655e50 1037 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
6ce6923b 1038 SCM_BOOL_F);
ba54a202 1039 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
6ce6923b 1040
3ee86942
MV
1041}
1042#undef FUNC_NAME
1043
56a3dcd4 1044#ifdef SCM_STRING_LENGTH_HISTOGRAM
6ce6923b 1045
9c44cd45 1046SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
e1b29f6a 1047#define FUNC_NAME s_scm_sys_stringbuf_hist
3ee86942
MV
1048{
1049 int i;
1050 for (i = 0; i < 1000; i++)
1051 if (lenhist[i])
1052 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1053 fprintf (stderr, ">999: %u\n", lenhist[1000]);
1054 return SCM_UNSPECIFIED;
be54b15d
DH
1055}
1056#undef FUNC_NAME
1057
3ee86942
MV
1058#endif
1059
1060\f
1061
1062SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1063 (SCM obj),
1064 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1065#define FUNC_NAME s_scm_string_p
1066{
1067 return scm_from_bool (IS_STRING (obj));
1068}
1069#undef FUNC_NAME
1070
1071
1072SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1073
1074SCM_DEFINE (scm_string, "string", 0, 0, 1,
1075 (SCM chrs),
1076 "@deffnx {Scheme Procedure} list->string chrs\n"
1077 "Return a newly allocated string composed of the arguments,\n"
1078 "@var{chrs}.")
1079#define FUNC_NAME s_scm_string
1080{
9aa27c1a 1081 SCM result = SCM_BOOL_F;
9c44cd45 1082 SCM rest;
3ee86942 1083 size_t len;
9c44cd45
MG
1084 size_t p = 0;
1085 long i;
9aa27c1a 1086 int wide = 0;
3ee86942 1087
9c44cd45
MG
1088 /* Verify that this is a list of chars. */
1089 i = scm_ilength (chrs);
3c7cf7f5 1090 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
3ee86942 1091
9c44cd45
MG
1092 len = (size_t) i;
1093 rest = chrs;
3ee86942 1094
9c44cd45 1095 while (len > 0 && scm_is_pair (rest))
3ee86942 1096 {
9c44cd45 1097 SCM elt = SCM_CAR (rest);
3ee86942 1098 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
9aa27c1a
MG
1099 if (SCM_CHAR (elt) > 0xFF)
1100 wide = 1;
9c44cd45
MG
1101 rest = SCM_CDR (rest);
1102 len--;
1103 scm_remember_upto_here_1 (elt);
1104 }
1105
1106 /* Construct a string containing this list of chars. */
1107 len = (size_t) i;
1108 rest = chrs;
1109
9aa27c1a 1110 if (wide == 0)
9c44cd45 1111 {
56a3dcd4
LC
1112 char *buf;
1113
190d4b0d 1114 result = scm_i_make_string (len, NULL, 0);
9aa27c1a 1115 result = scm_i_string_start_writing (result);
56a3dcd4 1116 buf = scm_i_string_writable_chars (result);
9aa27c1a
MG
1117 while (len > 0 && scm_is_pair (rest))
1118 {
1119 SCM elt = SCM_CAR (rest);
1120 buf[p] = (unsigned char) SCM_CHAR (elt);
1121 p++;
1122 rest = SCM_CDR (rest);
1123 len--;
1124 scm_remember_upto_here_1 (elt);
1125 }
1126 }
1127 else
1128 {
56a3dcd4
LC
1129 scm_t_wchar *buf;
1130
190d4b0d 1131 result = scm_i_make_wide_string (len, NULL, 0);
9aa27c1a 1132 result = scm_i_string_start_writing (result);
56a3dcd4 1133 buf = scm_i_string_writable_wide_chars (result);
9aa27c1a
MG
1134 while (len > 0 && scm_is_pair (rest))
1135 {
1136 SCM elt = SCM_CAR (rest);
1137 buf[p] = SCM_CHAR (elt);
1138 p++;
1139 rest = SCM_CDR (rest);
1140 len--;
1141 scm_remember_upto_here_1 (elt);
1142 }
3ee86942 1143 }
9c44cd45
MG
1144 scm_i_string_stop_writing ();
1145
3ee86942
MV
1146 if (len > 0)
1147 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
9c44cd45 1148 if (!scm_is_null (rest))
3ee86942
MV
1149 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1150
1151 return result;
1152}
1153#undef FUNC_NAME
be54b15d 1154
3b3b36dd 1155SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
6fa73e72 1156 (SCM k, SCM chr),
0d26a824
MG
1157 "Return a newly allocated string of\n"
1158 "length @var{k}. If @var{chr} is given, then all elements of\n"
1159 "the string are initialized to @var{chr}, otherwise the contents\n"
b7e64f8b 1160 "of the string are all set to @code{#\nul}.")
1bbd0b84 1161#define FUNC_NAME s_scm_make_string
0f2d19dd 1162{
3ee86942
MV
1163 return scm_c_make_string (scm_to_size_t (k), chr);
1164}
1165#undef FUNC_NAME
1166
1167SCM
1168scm_c_make_string (size_t len, SCM chr)
1169#define FUNC_NAME NULL
1170{
9c44cd45 1171 size_t p;
3ef6650d 1172 char *contents = NULL;
190d4b0d 1173 SCM res = scm_i_make_string (len, &contents, 0);
cb0d8be2 1174
3ef6650d
AW
1175 /* If no char is given, initialize string contents to NULL. */
1176 if (SCM_UNBNDP (chr))
1177 memset (contents, 0, len);
1178 else
e11e83f3 1179 {
3ee86942 1180 SCM_VALIDATE_CHAR (0, chr);
9c44cd45
MG
1181 res = scm_i_string_start_writing (res);
1182 for (p = 0; p < len; p++)
1183 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1184 scm_i_string_stop_writing ();
0f2d19dd 1185 }
e11e83f3
MV
1186
1187 return res;
0f2d19dd 1188}
1bbd0b84 1189#undef FUNC_NAME
0f2d19dd 1190
3b3b36dd 1191SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
0d26a824
MG
1192 (SCM string),
1193 "Return the number of characters in @var{string}.")
1bbd0b84 1194#define FUNC_NAME s_scm_string_length
0f2d19dd 1195{
d1ca2c64 1196 SCM_VALIDATE_STRING (1, string);
3ee86942 1197 return scm_from_size_t (STRING_LENGTH (string));
0f2d19dd 1198}
1bbd0b84 1199#undef FUNC_NAME
0f2d19dd 1200
f8ba2bb9 1201SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
9c44cd45
MG
1202 (SCM string),
1203 "Return the bytes used to represent a character in @var{string}."
1204 "This will return 1 or 4.")
f8ba2bb9 1205#define FUNC_NAME s_scm_string_bytes_per_char
9c44cd45
MG
1206{
1207 SCM_VALIDATE_STRING (1, string);
1208 if (!scm_i_is_narrow_string (string))
1209 return scm_from_int (4);
1210
1211 return scm_from_int (1);
1212}
1213#undef FUNC_NAME
1214
3ee86942
MV
1215size_t
1216scm_c_string_length (SCM string)
1217{
1218 if (!IS_STRING (string))
1219 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1220 return STRING_LENGTH (string);
1221}
1222
bd9e24b3 1223SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
6fa73e72 1224 (SCM str, SCM k),
9c44cd45
MG
1225 "Return character @var{k} of @var{str} using zero-origin\n"
1226 "indexing. @var{k} must be a valid index of @var{str}.")
1bbd0b84 1227#define FUNC_NAME s_scm_string_ref
0f2d19dd 1228{
3ae3166b 1229 size_t len;
a55c2b68 1230 unsigned long idx;
bd9e24b3 1231
d1ca2c64 1232 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1233
1234 len = scm_i_string_length (str);
1235 if (SCM_LIKELY (len > 0))
1236 idx = scm_to_unsigned_integer (k, 0, len - 1);
1237 else
1238 scm_out_of_range (NULL, k);
1239
9c44cd45
MG
1240 if (scm_i_is_narrow_string (str))
1241 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1242 else
1243 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
0f2d19dd 1244}
1bbd0b84 1245#undef FUNC_NAME
0f2d19dd 1246
3ee86942
MV
1247SCM
1248scm_c_string_ref (SCM str, size_t p)
1249{
1250 if (p >= scm_i_string_length (str))
1251 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1252 if (scm_i_is_narrow_string (str))
1253 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1254 else
1255 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1256
3ee86942 1257}
f0942910 1258
3b3b36dd 1259SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
6fa73e72 1260 (SCM str, SCM k, SCM chr),
9c44cd45
MG
1261 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1262 "an unspecified value. @var{k} must be a valid index of\n"
1263 "@var{str}.")
1bbd0b84 1264#define FUNC_NAME s_scm_string_set_x
0f2d19dd 1265{
3ae3166b 1266 size_t len;
a55c2b68
MV
1267 unsigned long idx;
1268
f0942910 1269 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1270
1271 len = scm_i_string_length (str);
1272 if (SCM_LIKELY (len > 0))
1273 idx = scm_to_unsigned_integer (k, 0, len - 1);
1274 else
1275 scm_out_of_range (NULL, k);
1276
34d19ef6 1277 SCM_VALIDATE_CHAR (3, chr);
9c44cd45
MG
1278 str = scm_i_string_start_writing (str);
1279 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1280 scm_i_string_stop_writing ();
1281
0f2d19dd
JB
1282 return SCM_UNSPECIFIED;
1283}
1bbd0b84 1284#undef FUNC_NAME
0f2d19dd 1285
3ee86942
MV
1286void
1287scm_c_string_set_x (SCM str, size_t p, SCM chr)
1288{
1289 if (p >= scm_i_string_length (str))
1290 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1291 str = scm_i_string_start_writing (str);
1292 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1293 scm_i_string_stop_writing ();
3ee86942 1294}
0f2d19dd 1295
3b3b36dd 1296SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
0d26a824
MG
1297 (SCM str, SCM start, SCM end),
1298 "Return a newly allocated string formed from the characters\n"
1299 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1300 "ending with index @var{end} (exclusive).\n"
1301 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1302 "exact integers satisfying:\n\n"
1303 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1bbd0b84 1304#define FUNC_NAME s_scm_substring
0f2d19dd 1305{
3ee86942 1306 size_t len, from, to;
685c0d71 1307
d1ca2c64 1308 SCM_VALIDATE_STRING (1, str);
3ee86942
MV
1309 len = scm_i_string_length (str);
1310 from = scm_to_unsigned_integer (start, 0, len);
a55c2b68 1311 if (SCM_UNBNDP (end))
3ee86942 1312 to = len;
a55c2b68 1313 else
3ee86942
MV
1314 to = scm_to_unsigned_integer (end, from, len);
1315 return scm_i_substring (str, from, to);
0f2d19dd 1316}
1bbd0b84 1317#undef FUNC_NAME
0f2d19dd 1318
ed35de72
MV
1319SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1320 (SCM str, SCM start, SCM end),
1321 "Return a newly allocated string formed from the characters\n"
1322 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1323 "ending with index @var{end} (exclusive).\n"
1324 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1325 "exact integers satisfying:\n"
1326 "\n"
1327 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1328 "\n"
1329 "The returned string is read-only.\n")
1330#define FUNC_NAME s_scm_substring_read_only
1331{
1332 size_t len, from, to;
1333
1334 SCM_VALIDATE_STRING (1, str);
1335 len = scm_i_string_length (str);
1336 from = scm_to_unsigned_integer (start, 0, len);
1337 if (SCM_UNBNDP (end))
1338 to = len;
1339 else
1340 to = scm_to_unsigned_integer (end, from, len);
1341 return scm_i_substring_read_only (str, from, to);
1342}
1343#undef FUNC_NAME
1344
3ee86942
MV
1345SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1346 (SCM str, SCM start, SCM end),
1347 "Return a newly allocated string formed from the characters\n"
1348 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1349 "ending with index @var{end} (exclusive).\n"
1350 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1351 "exact integers satisfying:\n\n"
1352 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1353#define FUNC_NAME s_scm_substring_copy
1354{
e1b29f6a
MV
1355 /* For the Scheme version, START is mandatory, but for the C
1356 version, it is optional. See scm_string_copy in srfi-13.c for a
1357 rationale.
1358 */
1359
1360 size_t from, to;
3ee86942
MV
1361
1362 SCM_VALIDATE_STRING (1, str);
e1b29f6a
MV
1363 scm_i_get_substring_spec (scm_i_string_length (str),
1364 start, &from, end, &to);
3ee86942
MV
1365 return scm_i_substring_copy (str, from, to);
1366}
1367#undef FUNC_NAME
1368
1369SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1370 (SCM str, SCM start, SCM end),
1371 "Return string that indirectly refers to the characters\n"
1372 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1373 "ending with index @var{end} (exclusive).\n"
1374 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1375 "exact integers satisfying:\n\n"
1376 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1377#define FUNC_NAME s_scm_substring_shared
1378{
1379 size_t len, from, to;
1380
1381 SCM_VALIDATE_STRING (1, str);
1382 len = scm_i_string_length (str);
1383 from = scm_to_unsigned_integer (start, 0, len);
1384 if (SCM_UNBNDP (end))
1385 to = len;
1386 else
1387 to = scm_to_unsigned_integer (end, from, len);
1388 return scm_i_substring_shared (str, from, to);
1389}
1390#undef FUNC_NAME
685c0d71 1391
3b3b36dd 1392SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
6fa73e72 1393 (SCM args),
9c44cd45 1394 "Return a newly allocated string whose characters form the\n"
0d26a824 1395 "concatenation of the given strings, @var{args}.")
1bbd0b84 1396#define FUNC_NAME s_scm_string_append
0f2d19dd
JB
1397{
1398 SCM res;
9c44cd45
MG
1399 size_t len = 0;
1400 int wide = 0;
c829a427 1401 SCM l, s;
bd4911ef 1402 size_t i;
9909c395
MG
1403 union
1404 {
1405 char *narrow;
1406 scm_t_wchar *wide;
1407 } data;
af45e3b0
DH
1408
1409 SCM_VALIDATE_REST_ARGUMENT (args);
9c44cd45 1410 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427
MV
1411 {
1412 s = SCM_CAR (l);
1413 SCM_VALIDATE_STRING (SCM_ARGn, s);
9c44cd45
MG
1414 len += scm_i_string_length (s);
1415 if (!scm_i_is_narrow_string (s))
1416 wide = 1;
c829a427 1417 }
9909c395 1418 data.narrow = NULL;
9c44cd45 1419 if (!wide)
190d4b0d 1420 res = scm_i_make_string (len, &data.narrow, 0);
9c44cd45 1421 else
190d4b0d 1422 res = scm_i_make_wide_string (len, &data.wide, 0);
9c44cd45
MG
1423
1424 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427 1425 {
edea856c 1426 size_t len;
c829a427 1427 s = SCM_CAR (l);
3ee86942 1428 SCM_VALIDATE_STRING (SCM_ARGn, s);
edea856c 1429 len = scm_i_string_length (s);
9c44cd45
MG
1430 if (!wide)
1431 {
9909c395
MG
1432 memcpy (data.narrow, scm_i_string_chars (s), len);
1433 data.narrow += len;
9c44cd45
MG
1434 }
1435 else
1436 {
1437 if (scm_i_is_narrow_string (s))
1438 {
1439 for (i = 0; i < scm_i_string_length (s); i++)
9909c395 1440 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
9c44cd45
MG
1441 }
1442 else
9909c395 1443 u32_cpy ((scm_t_uint32 *) data.wide,
9c44cd45 1444 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
9909c395 1445 data.wide += len;
9c44cd45 1446 }
c829a427
MV
1447 scm_remember_upto_here_1 (s);
1448 }
0f2d19dd
JB
1449 return res;
1450}
1bbd0b84 1451#undef FUNC_NAME
0f2d19dd 1452
24933780 1453
a3d7d5d5 1454\f
c62da8f8 1455/* Charset conversion error handling. */
a3d7d5d5
LC
1456
1457SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
c62da8f8
LC
1458SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1459
6851d3be
LC
1460/* Raise an exception informing that character CHR could not be written
1461 to PORT in its current encoding. */
d14418a5 1462void
ef7e4ba3 1463scm_encoding_error (const char *subr, int err, const char *message,
6851d3be 1464 SCM port, SCM chr)
ef7e4ba3 1465{
ef7e4ba3 1466 scm_throw (scm_encoding_error_key,
d050ef66
AW
1467 scm_list_n (scm_from_latin1_string (subr),
1468 scm_from_latin1_string (message),
ef7e4ba3 1469 scm_from_int (err),
6851d3be 1470 port, chr,
ef7e4ba3 1471 SCM_UNDEFINED));
a3d7d5d5
LC
1472}
1473
c62da8f8
LC
1474/* Raise an exception informing of an encoding error on PORT. This
1475 means that a character could not be written in PORT's encoding. */
1476void
1477scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1478{
1479 scm_throw (scm_decoding_error_key,
d050ef66
AW
1480 scm_list_n (scm_from_latin1_string (subr),
1481 scm_from_latin1_string (message),
c62da8f8
LC
1482 scm_from_int (err),
1483 port,
1484 SCM_UNDEFINED));
1485}
1486
1487\f
1488/* String conversion to/from C. */
1489
fac32b51 1490SCM
587a3355
MG
1491scm_from_stringn (const char *str, size_t len, const char *encoding,
1492 scm_t_string_failed_conversion_handler handler)
1493{
1494 size_t u32len, i;
1495 scm_t_wchar *u32;
1496 int wide = 0;
1497 SCM res;
1498
d40e1ca8 1499 /* The order of these checks is important. */
a574564c 1500 if (!str && len != 0)
d40e1ca8
AW
1501 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1502 if (len == (size_t) -1)
1503 len = strlen (str);
fac32b51 1504
75321259 1505 if (encoding == NULL || len == 0)
889975e5 1506 {
75321259 1507 /* If encoding is null (or the string is empty), use Latin-1. */
889975e5 1508 char *buf;
190d4b0d 1509 res = scm_i_make_string (len, &buf, 0);
889975e5
MG
1510 memcpy (buf, str, len);
1511 return res;
1512 }
1513
587a3355
MG
1514 u32len = 0;
1515 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1516 (enum iconv_ilseq_handler)
1517 handler,
1518 str, len,
1519 NULL,
1520 NULL, &u32len);
1521
ef7e4ba3 1522 if (SCM_UNLIKELY (u32 == NULL))
587a3355 1523 {
ef7e4ba3
LC
1524 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1525 handler. */
1526 SCM bv;
1527 signed char *buf;
1528
1529 buf = scm_gc_malloc_pointerless (len, "bytevector");
1530 memcpy (buf, str, len);
fb031aba 1531 bv = scm_c_take_gc_bytevector (buf, len);
ef7e4ba3 1532
c62da8f8
LC
1533 scm_decoding_error (__func__, errno,
1534 "input locale conversion error", bv);
587a3355
MG
1535 }
1536
1537 i = 0;
1538 while (i < u32len)
1539 if (u32[i++] > 0xFF)
1540 {
1541 wide = 1;
1542 break;
1543 }
1544
1545 if (!wide)
1546 {
1547 char *dst;
190d4b0d 1548 res = scm_i_make_string (u32len, &dst, 0);
587a3355
MG
1549 for (i = 0; i < u32len; i ++)
1550 dst[i] = (unsigned char) u32[i];
1551 dst[u32len] = '\0';
1552 }
1553 else
1554 {
1555 scm_t_wchar *wdst;
190d4b0d 1556 res = scm_i_make_wide_string (u32len, &wdst, 0);
587a3355
MG
1557 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1558 wdst[u32len] = 0;
1559 }
1560
1561 free (u32);
1562 return res;
1563}
1564
cf313a94 1565SCM
d40e1ca8 1566scm_from_locale_string (const char *str)
cf313a94 1567{
d40e1ca8 1568 return scm_from_locale_stringn (str, -1);
cf313a94
MG
1569}
1570
c829a427
MV
1571SCM
1572scm_from_locale_stringn (const char *str, size_t len)
1573{
95f5e303
AW
1574 return scm_from_stringn (str, len, locale_charset (),
1575 scm_i_get_conversion_strategy (SCM_BOOL_F));
c829a427 1576}
4d4528e7 1577
c829a427 1578SCM
d40e1ca8 1579scm_from_latin1_string (const char *str)
4d4528e7 1580{
d40e1ca8
AW
1581 return scm_from_latin1_stringn (str, -1);
1582}
9c44cd45 1583
d40e1ca8
AW
1584SCM
1585scm_from_latin1_stringn (const char *str, size_t len)
1586{
e9a35a96
LC
1587 char *buf;
1588 SCM result;
1589
1590 if (len == (size_t) -1)
1591 len = strlen (str);
1592
1593 /* Make a narrow string and copy STR as is. */
190d4b0d 1594 result = scm_i_make_string (len, &buf, 0);
e9a35a96
LC
1595 memcpy (buf, str, len);
1596
1597 return result;
c829a427 1598}
4d4528e7 1599
587a3355 1600SCM
d40e1ca8 1601scm_from_utf8_string (const char *str)
587a3355 1602{
d40e1ca8
AW
1603 return scm_from_utf8_stringn (str, -1);
1604}
1605
1606SCM
1607scm_from_utf8_stringn (const char *str, size_t len)
1608{
1609 return scm_from_stringn (str, len, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
587a3355
MG
1610}
1611
647dc1ac
LC
1612SCM
1613scm_from_utf32_string (const scm_t_wchar *str)
1614{
1615 return scm_from_utf32_stringn (str, -1);
1616}
1617
1618SCM
1619scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1620{
1621 SCM result;
1622 scm_t_wchar *buf;
1623
1624 if (len == (size_t) -1)
1625 len = u32_strlen ((uint32_t *) str);
1626
190d4b0d 1627 result = scm_i_make_wide_string (len, &buf, 0);
647dc1ac
LC
1628 memcpy (buf, str, len * sizeof (scm_t_wchar));
1629 scm_i_try_narrow_string (result);
1630
1631 return result;
1632}
1633
50b1996f
MG
1634/* Create a new scheme string from the C string STR. The memory of
1635 STR may be used directly as storage for the new string. */
13a94556
LC
1636/* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1637 would be to register a finalizer to eventually free(3) STR, which isn't
1638 worth it. Should we just deprecate the `scm_take_' functions? */
c829a427
MV
1639SCM
1640scm_take_locale_stringn (char *str, size_t len)
1641{
13a94556 1642 SCM res;
48ddf0d9 1643
13a94556
LC
1644 res = scm_from_locale_stringn (str, len);
1645 free (str);
c829a427 1646
c829a427
MV
1647 return res;
1648}
1649
48ddf0d9
KR
1650SCM
1651scm_take_locale_string (char *str)
1652{
1653 return scm_take_locale_stringn (str, -1);
1654}
1655
f1ee6d54
LC
1656/* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1657 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
31d4d02b
LC
1658 Set *LENP to the size of the resulting string.
1659
1660 FIXME: This is a hack we should get rid of. See
1661 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1662 for details. */
1663static void
1664unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
9c44cd45
MG
1665{
1666 char *before, *after;
1667 size_t i, j;
1668
4ff2b9f4
LC
1669 before = buf;
1670 after = buf;
9c44cd45
MG
1671 i = 0;
1672 j = 0;
1673 while (i < *lenp)
1674 {
1675 if ((i <= *lenp - 6)
1676 && before[i] == '\\'
1677 && before[i + 1] == 'u'
1678 && before[i + 2] == '0' && before[i + 3] == '0')
1679 {
1680 /* Convert \u00NN to \xNN */
1681 after[j] = '\\';
1682 after[j + 1] = 'x';
30a6b9ca
MG
1683 after[j + 2] = tolower ((int) before[i + 4]);
1684 after[j + 3] = tolower ((int) before[i + 5]);
9c44cd45
MG
1685 i += 6;
1686 j += 4;
1687 }
1688 else if ((i <= *lenp - 10)
1689 && before[i] == '\\'
1690 && before[i + 1] == 'U'
1691 && before[i + 2] == '0' && before[i + 3] == '0')
1692 {
1693 /* Convert \U00NNNNNN to \UNNNNNN */
1694 after[j] = '\\';
1695 after[j + 1] = 'U';
30a6b9ca
MG
1696 after[j + 2] = tolower ((int) before[i + 4]);
1697 after[j + 3] = tolower ((int) before[i + 5]);
1698 after[j + 4] = tolower ((int) before[i + 6]);
1699 after[j + 5] = tolower ((int) before[i + 7]);
1700 after[j + 6] = tolower ((int) before[i + 8]);
1701 after[j + 7] = tolower ((int) before[i + 9]);
9c44cd45
MG
1702 i += 10;
1703 j += 8;
1704 }
1705 else
1706 {
1707 after[j] = before[i];
1708 i++;
1709 j++;
1710 }
1711 }
1712 *lenp = j;
9c44cd45
MG
1713}
1714
f1ee6d54
LC
1715/* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1716 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1717 of the resulting string. BUF must be large enough to handle the
1718 worst case when `\uXXXX' escapes (6 characters) are replaced by
1719 `\xXXXX;' (7 characters). */
31d4d02b
LC
1720static void
1721unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
d31b9519
MG
1722{
1723 char *before, *after;
1724 size_t i, j;
1725 /* The worst case is if the input string contains all 4-digit hex escapes.
1726 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1727 size_t max_out_len = (*lenp * 7) / 6 + 1;
1728 size_t nzeros, ndigits;
1729
4ff2b9f4 1730 before = buf;
d31b9519
MG
1731 after = alloca (max_out_len);
1732 i = 0;
1733 j = 0;
1734 while (i < *lenp)
1735 {
1736 if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1737 || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1738 {
1739 if (before[i + 1] == 'u')
1740 ndigits = 4;
1741 else if (before[i + 1] == 'U')
1742 ndigits = 8;
1743 else
1744 abort ();
1745
1746 /* Add the R6RS hex escape initial sequence. */
1747 after[j] = '\\';
1748 after[j + 1] = 'x';
1749
1750 /* Move string positions to the start of the hex numbers. */
1751 i += 2;
1752 j += 2;
1753
1754 /* Find the number of initial zeros in this hex number. */
1755 nzeros = 0;
1756 while (before[i + nzeros] == '0' && nzeros < ndigits)
1757 nzeros++;
1758
1759 /* Copy the number, skipping initial zeros, and then move the string
1760 positions. */
1761 if (nzeros == ndigits)
1762 {
1763 after[j] = '0';
1764 i += ndigits;
1765 j += 1;
1766 }
1767 else
1768 {
1769 int pos;
1770 for (pos = 0; pos < ndigits - nzeros; pos++)
1771 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1772 i += ndigits;
1773 j += (ndigits - nzeros);
1774 }
1775
1776 /* Add terminating semicolon. */
1777 after[j] = ';';
1778 j++;
1779 }
1780 else
1781 {
1782 after[j] = before[i];
1783 i++;
1784 j++;
1785 }
1786 }
1787 *lenp = j;
d31b9519
MG
1788 memcpy (before, after, j);
1789}
1790
cf313a94 1791char *
d40e1ca8 1792scm_to_locale_string (SCM str)
cf313a94 1793{
d40e1ca8 1794 return scm_to_locale_stringn (str, NULL);
cf313a94 1795}
d31b9519 1796
c829a427 1797char *
fac32b51 1798scm_to_locale_stringn (SCM str, size_t *lenp)
c829a427 1799{
889975e5 1800 return scm_to_stringn (str, lenp,
95f5e303 1801 locale_charset (),
889975e5 1802 scm_i_get_conversion_strategy (SCM_BOOL_F));
9c44cd45
MG
1803}
1804
d40e1ca8
AW
1805char *
1806scm_to_latin1_string (SCM str)
1807{
1808 return scm_to_latin1_stringn (str, NULL);
1809}
1810
1811char *
1812scm_to_latin1_stringn (SCM str, size_t *lenp)
e9a35a96 1813#define FUNC_NAME "scm_to_latin1_stringn"
d40e1ca8 1814{
e9a35a96
LC
1815 char *result;
1816
1817 SCM_VALIDATE_STRING (1, str);
1818
1819 if (scm_i_is_narrow_string (str))
1820 {
fe133640
AW
1821 size_t len = scm_i_string_length (str);
1822
e9a35a96 1823 if (lenp)
fe133640 1824 *lenp = len;
e9a35a96 1825
fe133640 1826 result = scm_strndup (scm_i_string_data (str), len);
e9a35a96
LC
1827 }
1828 else
1829 result = scm_to_stringn (str, lenp, NULL,
fe133640 1830 SCM_FAILED_CONVERSION_ERROR);
e9a35a96
LC
1831
1832 return result;
d40e1ca8 1833}
e9a35a96 1834#undef FUNC_NAME
d40e1ca8
AW
1835
1836char *
1837scm_to_utf8_string (SCM str)
1838{
1839 return scm_to_utf8_stringn (str, NULL);
1840}
1841
1842char *
1843scm_to_utf8_stringn (SCM str, size_t *lenp)
1844{
1845 return scm_to_stringn (str, lenp, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1846}
1847
647dc1ac
LC
1848scm_t_wchar *
1849scm_to_utf32_string (SCM str)
1850{
1851 return scm_to_utf32_stringn (str, NULL);
1852}
1853
1854scm_t_wchar *
1855scm_to_utf32_stringn (SCM str, size_t *lenp)
1856#define FUNC_NAME "scm_to_utf32_stringn"
1857{
1858 scm_t_wchar *result;
1859
1860 SCM_VALIDATE_STRING (1, str);
1861
1862 if (scm_i_is_narrow_string (str))
1863 result = (scm_t_wchar *)
1864 scm_to_stringn (str, lenp, "UTF-32",
1865 SCM_FAILED_CONVERSION_ERROR);
1866 else
1867 {
1868 size_t len;
1869
1870 len = scm_i_string_length (str);
1871 if (lenp)
1872 *lenp = len;
1873
1874 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
1875 memcpy (result, scm_i_string_wide_chars (str),
1876 len * sizeof (scm_t_wchar));
1877 result[len] = 0;
1878 }
1879
1880 return result;
1881}
1882#undef FUNC_NAME
1883
29bcdbb0
LC
1884/* Return a malloc(3)-allocated buffer containing the contents of STR encoded
1885 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
1886 the returned buffer. If the conversion to ENCODING fails, apply the strategy
1887 defined by HANDLER. */
9c44cd45 1888char *
587a3355 1889scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
eca29b02 1890 scm_t_string_failed_conversion_handler handler)
9c44cd45 1891{
9c44cd45
MG
1892 char *buf;
1893 size_t ilen, len, i;
889975e5
MG
1894 int ret;
1895 const char *enc;
4d4528e7 1896
3ee86942 1897 if (!scm_is_string (str))
c829a427 1898 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1899 ilen = scm_i_string_length (str);
1900
1901 if (ilen == 0)
1902 {
1903 buf = scm_malloc (1);
1904 buf[0] = '\0';
1905 if (lenp)
1906 *lenp = 0;
1907 return buf;
1908 }
587a3355 1909
c829a427 1910 if (lenp == NULL)
9c44cd45
MG
1911 for (i = 0; i < ilen; i++)
1912 if (scm_i_string_ref (str, i) == '\0')
1913 scm_misc_error (NULL,
1914 "string contains #\\nul character: ~S",
1915 scm_list_1 (str));
1916
889975e5 1917 if (scm_i_is_narrow_string (str) && (encoding == NULL))
c829a427 1918 {
889975e5
MG
1919 /* If using native Latin-1 encoding, just copy the string
1920 contents. */
9c44cd45
MG
1921 if (lenp)
1922 {
1923 buf = scm_malloc (ilen);
1924 memcpy (buf, scm_i_string_chars (str), ilen);
1925 *lenp = ilen;
1926 return buf;
1927 }
1928 else
1929 {
1930 buf = scm_malloc (ilen + 1);
1931 memcpy (buf, scm_i_string_chars (str), ilen);
1932 buf[ilen] = '\0';
1933 return buf;
1934 }
c829a427 1935 }
9c44cd45 1936
587a3355 1937
9c44cd45
MG
1938 buf = NULL;
1939 len = 0;
889975e5
MG
1940 enc = encoding;
1941 if (enc == NULL)
1942 enc = "ISO-8859-1";
1943 if (scm_i_is_narrow_string (str))
1944 {
1945 ret = mem_iconveh (scm_i_string_chars (str), ilen,
1946 "ISO-8859-1", enc,
1947 (enum iconv_ilseq_handler) handler, NULL,
1948 &buf, &len);
9c44cd45 1949
889975e5 1950 if (ret != 0)
ef7e4ba3 1951 scm_encoding_error (__func__, errno,
6851d3be
LC
1952 "cannot convert narrow string to output locale",
1953 SCM_BOOL_F,
1954 /* FIXME: Faulty character unknown. */
1955 SCM_BOOL_F);
889975e5
MG
1956 }
1957 else
1958 {
d31b9519 1959 buf = u32_conv_to_encoding (enc,
889975e5 1960 (enum iconv_ilseq_handler) handler,
d31b9519 1961 (scm_t_uint32 *) scm_i_string_wide_chars (str),
889975e5
MG
1962 ilen,
1963 NULL,
1964 NULL, &len);
1965 if (buf == NULL)
ef7e4ba3 1966 scm_encoding_error (__func__, errno,
6851d3be
LC
1967 "cannot convert wide string to output locale",
1968 SCM_BOOL_F,
1969 /* FIXME: Faulty character unknown. */
1970 SCM_BOOL_F);
d31b9519
MG
1971 }
1972 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
1973 {
1974 if (SCM_R6RS_ESCAPES_P)
f1ee6d54
LC
1975 {
1976 /* The worst case is if the input string contains all 4-digit
1977 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
1978 (seven characters). Make BUF large enough to hold
1979 that. */
1980 buf = scm_realloc (buf, (len * 7) / 6 + 1);
31d4d02b 1981 unistring_escapes_to_r6rs_escapes (buf, &len);
f1ee6d54 1982 }
d31b9519 1983 else
31d4d02b 1984 unistring_escapes_to_guile_escapes (buf, &len);
4ff2b9f4
LC
1985
1986 buf = scm_realloc (buf, len);
889975e5 1987 }
9c44cd45 1988 if (lenp)
4d4528e7 1989 *lenp = len;
9c44cd45
MG
1990 else
1991 {
1992 buf = scm_realloc (buf, len + 1);
1993 buf[len] = '\0';
1994 }
24933780 1995
c829a427 1996 scm_remember_upto_here_1 (str);
9c44cd45 1997 return buf;
4d4528e7 1998}
af68e5e5 1999
c829a427
MV
2000size_t
2001scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
2002{
2003 size_t len;
9c44cd45 2004 char *result = NULL;
3ee86942 2005 if (!scm_is_string (str))
c829a427 2006 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
2007 result = scm_to_locale_stringn (str, &len);
2008
2009 memcpy (buf, result, (len > max_len) ? max_len : len);
2010 free (result);
2011
c829a427
MV
2012 scm_remember_upto_here_1 (str);
2013 return len;
2014}
af68e5e5 2015
a3d7d5d5
LC
2016\f
2017/* Unicode string normalization. */
2018
edb7bb47
JG
2019/* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2020 libguile/i18n.c. It would be useful to have this factored out into a more
2021 convenient location, but its use of alloca makes that tricky to do. */
2022
2023static SCM
2024normalize_str (SCM string, uninorm_t form)
2025{
2026 SCM ret;
2027 scm_t_uint32 *w_str;
2028 scm_t_wchar *cbuf;
2029 size_t rlen, len = scm_i_string_length (string);
2030
2031 if (scm_i_is_narrow_string (string))
2032 {
2033 size_t i;
2034 const char *buf = scm_i_string_chars (string);
2035
2036 w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2037
2038 for (i = 0; i < len; i ++)
2039 w_str[i] = (unsigned char) buf[i];
2040 w_str[len] = 0;
2041 }
d8164b04
JG
2042 else
2043 w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2044
edb7bb47
JG
2045 w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2046
190d4b0d 2047 ret = scm_i_make_wide_string (rlen, &cbuf, 0);
edb7bb47
JG
2048 u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2049 free (w_str);
d8164b04
JG
2050
2051 scm_i_try_narrow_string (ret);
2052
edb7bb47
JG
2053 return ret;
2054}
2055
2056SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2057 (SCM string),
2058 "Returns the NFC normalized form of @var{string}.")
2059#define FUNC_NAME s_scm_string_normalize_nfc
2060{
2061 SCM_VALIDATE_STRING (1, string);
2062 return normalize_str (string, UNINORM_NFC);
2063}
2064#undef FUNC_NAME
2065
2066SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2067 (SCM string),
2068 "Returns the NFD normalized form of @var{string}.")
2069#define FUNC_NAME s_scm_string_normalize_nfd
2070{
2071 SCM_VALIDATE_STRING (1, string);
2072 return normalize_str (string, UNINORM_NFD);
2073}
2074#undef FUNC_NAME
2075
2076SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2077 (SCM string),
2078 "Returns the NFKC normalized form of @var{string}.")
2079#define FUNC_NAME s_scm_string_normalize_nfkc
2080{
2081 SCM_VALIDATE_STRING (1, string);
2082 return normalize_str (string, UNINORM_NFKC);
2083}
2084#undef FUNC_NAME
2085
2086SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2087 (SCM string),
2088 "Returns the NFKD normalized form of @var{string}.")
2089#define FUNC_NAME s_scm_string_normalize_nfkd
2090{
2091 SCM_VALIDATE_STRING (1, string);
2092 return normalize_str (string, UNINORM_NFKD);
2093}
2094#undef FUNC_NAME
2095
7505c6e0
MW
2096/* converts C scm_array of strings to SCM scm_list of strings.
2097 If argc < 0, a null terminated scm_array is assumed.
2098 The current locale encoding is assumed */
9c44cd45 2099SCM
3ee86942
MV
2100scm_makfromstrs (int argc, char **argv)
2101{
2102 int i = argc;
2103 SCM lst = SCM_EOL;
2104 if (0 > i)
2105 for (i = 0; argv[i]; i++);
2106 while (i--)
2107 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2108 return lst;
2109}
2110
c829a427 2111/* Return a newly allocated array of char pointers to each of the strings
7505c6e0
MW
2112 in args, with a terminating NULL pointer. The strings are encoded using
2113 the current locale. */
c829a427
MV
2114
2115char **
2116scm_i_allocate_string_pointers (SCM list)
2a776823 2117#define FUNC_NAME "scm_i_allocate_string_pointers"
af68e5e5 2118{
c829a427 2119 char **result;
7505c6e0 2120 int list_len = scm_ilength (list);
c829a427
MV
2121 int i;
2122
7505c6e0 2123 if (list_len < 0)
c829a427
MV
2124 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2125
7505c6e0 2126 result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2a776823 2127 "string pointers");
7505c6e0 2128 result[list_len] = NULL;
c829a427 2129
7505c6e0 2130 /* The list might have been modified in another thread, so
c829a427
MV
2131 we check LIST before each access.
2132 */
7505c6e0 2133 for (i = 0; i < list_len && scm_is_pair (list); i++)
c829a427 2134 {
7505c6e0
MW
2135 SCM str = SCM_CAR (list);
2136 size_t len; /* String length in bytes */
2137 char *c_str = scm_to_locale_stringn (str, &len);
2138
2139 /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2140 scm_malloc to allocate the returned string, which must be
2141 explicitly deallocated. This forces us to copy the string a
2142 second time into a new buffer. Ideally there would be variants
2143 of scm_to_*_stringn that can return garbage-collected buffers. */
2144
2145 result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2146 memcpy (result[i], c_str, len);
2a776823 2147 result[i][len] = '\0';
7505c6e0 2148 free (c_str);
2a776823 2149
c829a427
MV
2150 list = SCM_CDR (list);
2151 }
2152
c829a427 2153 return result;
af68e5e5 2154}
2a776823 2155#undef FUNC_NAME
24933780 2156
6f14f578
MV
2157void
2158scm_i_get_substring_spec (size_t len,
2159 SCM start, size_t *cstart,
2160 SCM end, size_t *cend)
2161{
2162 if (SCM_UNBNDP (start))
2163 *cstart = 0;
2164 else
2165 *cstart = scm_to_unsigned_integer (start, 0, len);
2166
2167 if (SCM_UNBNDP (end))
2168 *cend = len;
2169 else
2170 *cend = scm_to_unsigned_integer (end, *cstart, len);
2171}
2172
3ee86942
MV
2173#if SCM_ENABLE_DEPRECATED
2174
556d75db
MV
2175/* When these definitions are removed, it becomes reasonable to use
2176 read-only strings for string literals. For that, change the reader
2177 to create string literals with scm_c_substring_read_only instead of
2178 with scm_c_substring_copy.
2179*/
2180
3ee86942 2181int
fe78c51a 2182scm_i_deprecated_stringp (SCM str)
3ee86942
MV
2183{
2184 scm_c_issue_deprecation_warning
2185 ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
2186
2616f0e0 2187 return scm_is_string (str);
3ee86942
MV
2188}
2189
2190char *
fe78c51a 2191scm_i_deprecated_string_chars (SCM str)
3ee86942
MV
2192{
2193 char *chars;
2194
2195 scm_c_issue_deprecation_warning
2196 ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
2197
2616f0e0
MV
2198 /* We don't accept shared substrings here since they are not
2199 null-terminated.
2200 */
2201 if (IS_SH_STRING (str))
c291b588
LC
2202 scm_misc_error (NULL,
2203 "SCM_STRING_CHARS does not work with shared substrings",
2616f0e0
MV
2204 SCM_EOL);
2205
877f06c3 2206 /* We explicitly test for read-only strings to produce a better
556d75db
MV
2207 error message.
2208 */
2209
2210 if (IS_RO_STRING (str))
c291b588
LC
2211 scm_misc_error (NULL,
2212 "SCM_STRING_CHARS does not work with read-only strings",
556d75db 2213 SCM_EOL);
c291b588 2214
2616f0e0 2215 /* The following is still wrong, of course...
3ee86942 2216 */
9c44cd45 2217 str = scm_i_string_start_writing (str);
3ee86942
MV
2218 chars = scm_i_string_writable_chars (str);
2219 scm_i_string_stop_writing ();
2220 return chars;
2221}
2222
2223size_t
fe78c51a 2224scm_i_deprecated_string_length (SCM str)
3ee86942
MV
2225{
2226 scm_c_issue_deprecation_warning
2227 ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
2228 return scm_c_string_length (str);
2229}
2230
2231#endif
2232
2a610be5
AW
2233static SCM
2234string_handle_ref (scm_t_array_handle *h, size_t index)
2235{
2236 return scm_c_string_ref (h->array, index);
2237}
2238
2239static void
2240string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2241{
2242 scm_c_string_set_x (h->array, index, val);
2243}
2244
2245static void
2246string_get_handle (SCM v, scm_t_array_handle *h)
2247{
2248 h->array = v;
2249 h->ndims = 1;
2250 h->dims = &h->dim0;
2251 h->dim0.lbnd = 0;
2252 h->dim0.ubnd = scm_c_string_length (v) - 1;
2253 h->dim0.inc = 1;
2254 h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2255 h->elements = h->writable_elements = NULL;
2256}
2257
c5f17102 2258SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2a610be5 2259 string_handle_ref, string_handle_set,
f65e0168
LC
2260 string_get_handle)
2261SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2a610be5 2262
0f2d19dd
JB
2263void
2264scm_init_strings ()
0f2d19dd 2265{
a7e392c1 2266 scm_nullstr = scm_i_make_string (0, NULL, 0);
7c33806a 2267
a0599745 2268#include "libguile/strings.x"
0f2d19dd
JB
2269}
2270
89e00824
ML
2271
2272/*
2273 Local Variables:
2274 c-file-style: "gnu"
2275 End:
2276*/