Make scm_nullstr mutable
[bpt/guile.git] / libguile / strings.c
CommitLineData
a7e392c1 1/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
0f2d19dd 2 *
73be1d9e 3 * This library is free software; you can redistribute it and/or
53befeb7
NJ
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
0f2d19dd 7 *
53befeb7
NJ
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
0f2d19dd 12 *
73be1d9e
MV
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
53befeb7
NJ
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
73be1d9e 17 */
1bbd0b84 18
1bbd0b84 19
0f2d19dd 20\f
dbb605f5
LC
21#ifdef HAVE_CONFIG_H
22# include <config.h>
23#endif
0f2d19dd 24
cdd47ec7 25#include <alloca.h>
faf2c9d7 26#include <string.h>
3ee86942 27#include <stdio.h>
9c44cd45 28#include <ctype.h>
edb7bb47 29#include <uninorm.h>
9c44cd45 30#include <unistr.h>
eca29b02 31#include <uniconv.h>
faf2c9d7 32
889975e5
MG
33#include "striconveh.h"
34
a0599745
MD
35#include "libguile/_scm.h"
36#include "libguile/chars.h"
7c33806a 37#include "libguile/root.h"
a0599745 38#include "libguile/strings.h"
a3d7d5d5 39#include "libguile/error.h"
f45eccff 40#include "libguile/generalized-vectors.h"
1afff620 41#include "libguile/deprecation.h"
a0599745 42#include "libguile/validate.h"
d31b9519 43#include "libguile/private-options.h"
1afff620 44
0f2d19dd
JB
45\f
46
47/* {Strings}
48 */
49
3ee86942
MV
50
51/* Stringbufs
52 *
53 * XXX - keeping an accurate refcount during GC seems to be quite
54 * tricky, so we just keep score of whether a stringbuf might be
50b1996f 55 * shared, not whether it definitely is.
3ee86942
MV
56 *
57 * The scheme I (mvo) tried to keep an accurate reference count would
58 * recount all strings that point to a stringbuf during the mark-phase
59 * of the GC. This was done since one cannot access the stringbuf of
60 * a string when that string is freed (in order to decrease the
61 * reference count). The memory of the stringbuf might have been
62 * reused already for something completely different.
63 *
64 * This recounted worked for a small number of threads beating on
65 * cow-strings, but it failed randomly with more than 10 threads, say.
66 * I couldn't figure out what went wrong, so I used the conservative
67 * approach implemented below.
50b1996f 68 *
ba54a202
LC
69 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
70 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
71 * strings.
3ee86942
MV
72 */
73
ba54a202
LC
74/* The size in words of the stringbuf header (type tag + size). */
75#define STRINGBUF_HEADER_SIZE 2U
76
77#define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
78
35920c00 79#define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
5f236208 80#define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
3ee86942
MV
81
82#define STRINGBUF_TAG scm_tc7_stringbuf
83#define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
9c44cd45 84#define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
3ee86942 85
100e20c7 86#define STRINGBUF_CONTENTS(buf) ((void *) \
ba54a202
LC
87 SCM_CELL_OBJECT_LOC (buf, \
88 STRINGBUF_HEADER_SIZE))
100e20c7
LC
89#define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
90#define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
3ee86942 91
100e20c7 92#define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
3ee86942 93
9b41542f
LC
94#define SET_STRINGBUF_SHARED(buf) \
95 do \
96 { \
97 /* Don't modify BUF if it's already marked as shared since it might be \
98 a read-only, statically allocated stringbuf. */ \
99 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
100 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
101 } \
102 while (0)
3ee86942 103
56a3dcd4 104#ifdef SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
105static size_t lenhist[1001];
106#endif
107
50b1996f
MG
108/* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
109 characters. */
3ee86942
MV
110static SCM
111make_stringbuf (size_t len)
0f2d19dd 112{
3ee86942
MV
113 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
114 scm_i_symbol_chars, all stringbufs are null-terminated. Once
115 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
116 has been changed for scm_i_symbol_chars, this null-termination
117 can be dropped.
118 */
119
ba54a202
LC
120 SCM buf;
121
56a3dcd4 122#ifdef SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
123 if (len < 1000)
124 lenhist[len]++;
125 else
126 lenhist[1000]++;
127#endif
0f2d19dd 128
ba54a202
LC
129 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
130 "string"));
131
132 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
133 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
134
135 STRINGBUF_CHARS (buf)[len] = 0;
136
137 return buf;
3ee86942 138}
e53cc817 139
50b1996f
MG
140/* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
141 characters. */
9c44cd45
MG
142static SCM
143make_wide_stringbuf (size_t len)
144{
ba54a202
LC
145 SCM buf;
146 size_t raw_len;
147
56a3dcd4 148#ifdef SCM_STRING_LENGTH_HISTOGRAM
9c44cd45
MG
149 if (len < 1000)
150 lenhist[len]++;
151 else
152 lenhist[1000]++;
153#endif
154
ba54a202
LC
155 raw_len = (len + 1) * sizeof (scm_t_wchar);
156 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
157 "string"));
158
159 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
160 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
161
162 STRINGBUF_WIDE_CHARS (buf)[len] = 0;
163
164 return buf;
9c44cd45
MG
165}
166
ba54a202
LC
167/* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
168 characters from BUF. */
169static SCM
170wide_stringbuf (SCM buf)
9c44cd45 171{
ba54a202 172 SCM new_buf;
9c44cd45
MG
173
174 if (STRINGBUF_WIDE (buf))
ba54a202
LC
175 new_buf = buf;
176 else
9c44cd45 177 {
ba54a202
LC
178 size_t i, len;
179 scm_t_wchar *mem;
9c44cd45 180
ba54a202 181 len = STRINGBUF_LENGTH (buf);
9c44cd45 182
ba54a202 183 new_buf = make_wide_stringbuf (len);
9c44cd45 184
ba54a202 185 mem = STRINGBUF_WIDE_CHARS (new_buf);
9c44cd45 186 for (i = 0; i < len; i++)
ba54a202 187 mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
9c44cd45 188 mem[len] = 0;
9c44cd45 189 }
ba54a202
LC
190
191 return new_buf;
3ee86942 192}
bd9e24b3 193
ba54a202
LC
194/* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
195 characters from BUF, if possible. */
196static SCM
587a3355
MG
197narrow_stringbuf (SCM buf)
198{
ba54a202 199 SCM new_buf;
587a3355
MG
200
201 if (!STRINGBUF_WIDE (buf))
ba54a202
LC
202 new_buf = buf;
203 else
204 {
205 size_t i, len;
206 scm_t_wchar *wmem;
207 unsigned char *mem;
587a3355 208
ba54a202
LC
209 len = STRINGBUF_LENGTH (buf);
210 wmem = STRINGBUF_WIDE_CHARS (buf);
587a3355 211
ba54a202
LC
212 for (i = 0; i < len; i++)
213 if (wmem[i] > 0xFF)
214 /* BUF cannot be narrowed. */
215 return buf;
587a3355 216
ba54a202 217 new_buf = make_stringbuf (len);
587a3355 218
ba54a202
LC
219 mem = STRINGBUF_CHARS (new_buf);
220 for (i = 0; i < len; i++)
221 mem[i] = (unsigned char) wmem[i];
222 mem[len] = 0;
223 }
224
225 return new_buf;
587a3355
MG
226}
227
9de87eea 228scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
bd9e24b3 229
ba54a202 230\f
3ee86942
MV
231/* Copy-on-write strings.
232 */
bd9e24b3 233
3ee86942 234#define STRING_TAG scm_tc7_string
bd9e24b3 235
3ee86942
MV
236#define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
237#define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
238#define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
bd9e24b3 239
3ee86942
MV
240#define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
241#define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
242
243#define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
244
ed35de72
MV
245/* Read-only strings.
246 */
247
35920c00 248#define RO_STRING_TAG scm_tc7_ro_string
ed35de72
MV
249#define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
250
e1b29f6a
MV
251/* Mutation-sharing substrings
252 */
253
254#define SH_STRING_TAG (scm_tc7_string + 0x100)
255
256#define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
257/* START and LENGTH as for STRINGs. */
258
259#define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
260
e7efe8e7
AW
261SCM scm_nullstr;
262
50b1996f
MG
263/* Create a scheme string with space for LEN 8-bit Latin-1-encoded
264 characters. CHARSP, if not NULL, will be set to location of the
190d4b0d
LC
265 char array. If READ_ONLY_P, the returned string is read-only;
266 otherwise it is writable. */
3ee86942 267SCM
190d4b0d 268scm_i_make_string (size_t len, char **charsp, int read_only_p)
3ee86942
MV
269{
270 SCM buf = make_stringbuf (len);
271 SCM res;
272 if (charsp)
f59cf998 273 *charsp = (char *) STRINGBUF_CHARS (buf);
190d4b0d
LC
274 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
275 SCM_UNPACK (buf),
276 (scm_t_bits) 0, (scm_t_bits) len);
3ee86942 277 return res;
0f2d19dd
JB
278}
279
50b1996f
MG
280/* Create a scheme string with space for LEN 32-bit UCS-4-encoded
281 characters. CHARSP, if not NULL, will be set to location of the
190d4b0d
LC
282 character array. If READ_ONLY_P, the returned string is read-only;
283 otherwise it is writable. */
9c44cd45 284SCM
190d4b0d 285scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
9c44cd45
MG
286{
287 SCM buf = make_wide_stringbuf (len);
288 SCM res;
289 if (charsp)
290 *charsp = STRINGBUF_WIDE_CHARS (buf);
190d4b0d
LC
291 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
292 SCM_UNPACK (buf),
9c44cd45
MG
293 (scm_t_bits) 0, (scm_t_bits) len);
294 return res;
295}
296
3ee86942
MV
297static void
298validate_substring_args (SCM str, size_t start, size_t end)
299{
300 if (!IS_STRING (str))
301 scm_wrong_type_arg_msg (NULL, 0, str, "string");
302 if (start > STRING_LENGTH (str))
303 scm_out_of_range (NULL, scm_from_size_t (start));
304 if (end > STRING_LENGTH (str) || end < start)
305 scm_out_of_range (NULL, scm_from_size_t (end));
306}
0f2d19dd 307
e1b29f6a
MV
308static inline void
309get_str_buf_start (SCM *str, SCM *buf, size_t *start)
310{
311 *start = STRING_START (*str);
312 if (IS_SH_STRING (*str))
313 {
314 *str = SH_STRING_STRING (*str);
315 *start += STRING_START (*str);
316 }
317 *buf = STRING_STRINGBUF (*str);
318}
319
3ee86942
MV
320SCM
321scm_i_substring (SCM str, size_t start, size_t end)
0f2d19dd 322{
e1b29f6a
MV
323 SCM buf;
324 size_t str_start;
325 get_str_buf_start (&str, &buf, &str_start);
9de87eea 326 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 327 SET_STRINGBUF_SHARED (buf);
9de87eea 328 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 329 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
e1b29f6a
MV
330 (scm_t_bits)str_start + start,
331 (scm_t_bits) end - start);
0f2d19dd
JB
332}
333
ed35de72
MV
334SCM
335scm_i_substring_read_only (SCM str, size_t start, size_t end)
336{
45a9f430
LC
337 SCM buf;
338 size_t str_start;
339 get_str_buf_start (&str, &buf, &str_start);
340 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
341 SET_STRINGBUF_SHARED (buf);
342 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
343 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
344 (scm_t_bits)str_start + start,
345 (scm_t_bits) end - start);
ed35de72
MV
346}
347
3ee86942
MV
348SCM
349scm_i_substring_copy (SCM str, size_t start, size_t end)
350{
351 size_t len = end - start;
edea856c 352 SCM buf, my_buf;
e1b29f6a
MV
353 size_t str_start;
354 get_str_buf_start (&str, &buf, &str_start);
9c44cd45
MG
355 if (scm_i_is_narrow_string (str))
356 {
357 my_buf = make_stringbuf (len);
358 memcpy (STRINGBUF_CHARS (my_buf),
359 STRINGBUF_CHARS (buf) + str_start + start, len);
360 }
361 else
362 {
363 my_buf = make_wide_stringbuf (len);
364 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
365 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
366 + start), len);
367 /* Even though this string is wide, the substring may be narrow.
50b1996f 368 Consider adding code to narrow the string. */
9c44cd45 369 }
3ee86942 370 scm_remember_upto_here_1 (buf);
9c44cd45
MG
371 return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
372 (scm_t_bits) 0, (scm_t_bits) len);
3ee86942 373}
0f2d19dd 374
e1b29f6a
MV
375SCM
376scm_i_substring_shared (SCM str, size_t start, size_t end)
377{
378 if (start == 0 && end == STRING_LENGTH (str))
379 return str;
380 else
381 {
382 size_t len = end - start;
383 if (IS_SH_STRING (str))
384 {
385 start += STRING_START (str);
386 str = SH_STRING_STRING (str);
387 }
388 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
389 (scm_t_bits)start, (scm_t_bits) len);
390 }
391}
392
3ee86942
MV
393SCM
394scm_c_substring (SCM str, size_t start, size_t end)
395{
396 validate_substring_args (str, start, end);
397 return scm_i_substring (str, start, end);
398}
ee149d03 399
ed35de72
MV
400SCM
401scm_c_substring_read_only (SCM str, size_t start, size_t end)
402{
403 validate_substring_args (str, start, end);
404 return scm_i_substring_read_only (str, start, end);
405}
406
0f2d19dd 407SCM
3ee86942 408scm_c_substring_copy (SCM str, size_t start, size_t end)
0f2d19dd 409{
3ee86942
MV
410 validate_substring_args (str, start, end);
411 return scm_i_substring_copy (str, start, end);
412}
413
3ee86942
MV
414SCM
415scm_c_substring_shared (SCM str, size_t start, size_t end)
416{
417 validate_substring_args (str, start, end);
418 return scm_i_substring_shared (str, start, end);
419}
0f2d19dd 420
d6c74168 421\f
3ee86942
MV
422/* Internal accessors
423 */
424
50b1996f
MG
425/* Returns the number of characters in STR. This may be different
426 than the memory size of the string storage. */
3ee86942
MV
427size_t
428scm_i_string_length (SCM str)
0f2d19dd 429{
3ee86942 430 return STRING_LENGTH (str);
0f2d19dd
JB
431}
432
50b1996f
MG
433/* True if the string is 'narrow', meaning it has a 8-bit Latin-1
434 encoding. False if it is 'wide', having a 32-bit UCS-4
435 encoding. */
9c44cd45
MG
436int
437scm_i_is_narrow_string (SCM str)
438{
439 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
440}
441
587a3355
MG
442/* Try to coerce a string to be narrow. It if is narrow already, do
443 nothing. If it is wide, shrink it to narrow if none of its
444 characters are above 0xFF. Return true if the string is narrow or
445 was made to be narrow. */
446int
447scm_i_try_narrow_string (SCM str)
448{
ba54a202 449 SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
587a3355
MG
450
451 return scm_i_is_narrow_string (str);
452}
453
100e20c7
LC
454/* Return a pointer to the raw data of the string, which can be either Latin-1
455 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
456const void *
457scm_i_string_data (SCM str)
458{
459 SCM buf;
460 size_t start;
461 const char *data;
462
463 get_str_buf_start (&str, &buf, &start);
464
465 data = STRINGBUF_CONTENTS (buf);
466 data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
467
468 return data;
469}
470
50b1996f
MG
471/* Returns a pointer to the 8-bit Latin-1 encoded character array of
472 STR. */
3ee86942
MV
473const char *
474scm_i_string_chars (SCM str)
475{
476 SCM buf;
e1b29f6a
MV
477 size_t start;
478 get_str_buf_start (&str, &buf, &start);
9c44cd45 479 if (scm_i_is_narrow_string (str))
f59cf998 480 return (const char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
481 else
482 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
483 scm_list_1 (str));
484 return NULL;
3ee86942 485}
b00418df 486
50b1996f
MG
487/* Returns a pointer to the 32-bit UCS-4 encoded character array of
488 STR. */
9c44cd45
MG
489const scm_t_wchar *
490scm_i_string_wide_chars (SCM str)
491{
492 SCM buf;
493 size_t start;
494
495 get_str_buf_start (&str, &buf, &start);
496 if (!scm_i_is_narrow_string (str))
f59cf998 497 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
9c44cd45
MG
498 else
499 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
500 scm_list_1 (str));
501}
502
503/* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
504 a new string buffer, so that it can be modified without modifying
50b1996f
MG
505 other strings. Also, lock the string mutex. Later, one must call
506 scm_i_string_stop_writing to unlock the mutex. */
9c44cd45
MG
507SCM
508scm_i_string_start_writing (SCM orig_str)
b00418df 509{
ed35de72 510 SCM buf, str = orig_str;
e1b29f6a 511 size_t start;
ed35de72 512
e1b29f6a 513 get_str_buf_start (&str, &buf, &start);
ed35de72
MV
514 if (IS_RO_STRING (str))
515 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
516
9de87eea 517 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942
MV
518 if (STRINGBUF_SHARED (buf))
519 {
9c44cd45 520 /* Clone the stringbuf. */
3ee86942
MV
521 size_t len = STRING_LENGTH (str);
522 SCM new_buf;
523
9de87eea 524 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 525
9c44cd45
MG
526 if (scm_i_is_narrow_string (str))
527 {
528 new_buf = make_stringbuf (len);
529 memcpy (STRINGBUF_CHARS (new_buf),
530 STRINGBUF_CHARS (buf) + STRING_START (str), len);
531
532 }
533 else
534 {
535 new_buf = make_wide_stringbuf (len);
536 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
537 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
538 + STRING_START (str)), len);
539 }
3ee86942 540
3ee86942
MV
541 SET_STRING_STRINGBUF (str, new_buf);
542 start -= STRING_START (str);
902578f1
LC
543
544 /* FIXME: The following operations are not atomic, so other threads
545 looking at STR may see an inconsistent state. Nevertheless it can't
546 hurt much since (i) accessing STR while it is being mutated can't
547 yield a crash, and (ii) concurrent accesses to STR should be
548 protected by a mutex at the application level. The latter may not
549 apply when STR != ORIG_STR, though. */
3ee86942 550 SET_STRING_START (str, 0);
902578f1 551 SET_STRING_STRINGBUF (str, new_buf);
3ee86942
MV
552
553 buf = new_buf;
554
9de87eea 555 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 556 }
9c44cd45
MG
557 return orig_str;
558}
3ee86942 559
50b1996f 560/* Return a pointer to the 8-bit Latin-1 chars of a string. */
9c44cd45
MG
561char *
562scm_i_string_writable_chars (SCM str)
563{
564 SCM buf;
565 size_t start;
3ee86942 566
9c44cd45
MG
567 get_str_buf_start (&str, &buf, &start);
568 if (scm_i_is_narrow_string (str))
f59cf998 569 return (char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
570 else
571 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
572 scm_list_1 (str));
573 return NULL;
b00418df
DH
574}
575
50b1996f 576/* Return a pointer to the UCS-4 codepoints of a string. */
9c44cd45
MG
577static scm_t_wchar *
578scm_i_string_writable_wide_chars (SCM str)
579{
580 SCM buf;
581 size_t start;
582
583 get_str_buf_start (&str, &buf, &start);
584 if (!scm_i_is_narrow_string (str))
585 return STRINGBUF_WIDE_CHARS (buf) + start;
586 else
1c7b216f 587 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
9c44cd45 588 scm_list_1 (str));
b00418df
DH
589}
590
50b1996f
MG
591/* Unlock the string mutex that was locked when
592 scm_i_string_start_writing was called. */
3ee86942
MV
593void
594scm_i_string_stop_writing (void)
595{
9de87eea 596 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 597}
b00418df 598
50b1996f 599/* Return the Xth character of STR as a UCS-4 codepoint. */
9c44cd45
MG
600scm_t_wchar
601scm_i_string_ref (SCM str, size_t x)
602{
603 if (scm_i_is_narrow_string (str))
604 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
605 else
606 return scm_i_string_wide_chars (str)[x];
607}
608
889975e5
MG
609/* Returns index+1 of the first char in STR that matches C, or
610 0 if the char is not found. */
611int
612scm_i_string_contains_char (SCM str, char ch)
613{
614 size_t i;
615 size_t len = scm_i_string_length (str);
616
617 i = 0;
618 if (scm_i_is_narrow_string (str))
619 {
620 while (i < len)
621 {
622 if (scm_i_string_chars (str)[i] == ch)
623 return i+1;
624 i++;
625 }
626 }
627 else
628 {
629 while (i < len)
630 {
631 if (scm_i_string_wide_chars (str)[i]
632 == (unsigned char) ch)
633 return i+1;
634 i++;
635 }
636 }
637 return 0;
638}
639
3f47e526
MG
640int
641scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
642{
643 if (scm_i_is_narrow_string (sstr))
644 {
645 const char *a = scm_i_string_chars (sstr) + start_x;
646 const char *b = cstr;
647 return strncmp (a, b, strlen(b));
648 }
649 else
650 {
651 size_t i;
652 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
653 const char *b = cstr;
654 for (i = 0; i < strlen (b); i++)
655 {
656 if (a[i] != (unsigned char) b[i])
657 return 1;
658 }
659 }
660 return 0;
661}
662
50b1996f 663/* Set the Pth character of STR to UCS-4 codepoint CHR. */
9c44cd45
MG
664void
665scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
666{
667 if (chr > 0xFF && scm_i_is_narrow_string (str))
ba54a202 668 SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
9c44cd45
MG
669
670 if (scm_i_is_narrow_string (str))
671 {
672 char *dst = scm_i_string_writable_chars (str);
587a3355 673 dst[p] = chr;
9c44cd45
MG
674 }
675 else
676 {
677 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
678 dst[p] = chr;
679 }
680}
681
ba54a202 682\f
3ee86942 683/* Symbols.
587a3355 684
3ee86942
MV
685 Basic symbol creation and accessing is done here, the rest is in
686 symbols.[hc]. This has been done to keep stringbufs and the
687 internals of strings and string-like objects confined to this file.
688*/
689
690#define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
691
692SCM
6869328b
MV
693scm_i_make_symbol (SCM name, scm_t_bits flags,
694 unsigned long hash, SCM props)
3ee86942
MV
695{
696 SCM buf;
697 size_t start = STRING_START (name);
698 size_t length = STRING_LENGTH (name);
699
700 if (IS_SH_STRING (name))
701 {
702 name = SH_STRING_STRING (name);
703 start += STRING_START (name);
704 }
705 buf = SYMBOL_STRINGBUF (name);
706
707 if (start == 0 && length == STRINGBUF_LENGTH (buf))
708 {
709 /* reuse buf. */
9de87eea 710 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 711 SET_STRINGBUF_SHARED (buf);
9de87eea 712 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942
MV
713 }
714 else
715 {
716 /* make new buf. */
9c44cd45
MG
717 if (scm_i_is_narrow_string (name))
718 {
719 SCM new_buf = make_stringbuf (length);
720 memcpy (STRINGBUF_CHARS (new_buf),
721 STRINGBUF_CHARS (buf) + start, length);
722 buf = new_buf;
723 }
724 else
725 {
726 SCM new_buf = make_wide_stringbuf (length);
727 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
728 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
729 length);
730 buf = new_buf;
731 }
3ee86942 732 }
6869328b 733 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
3ee86942
MV
734 (scm_t_bits) hash, SCM_UNPACK (props));
735}
736
fd0a5bbc
HWN
737SCM
738scm_i_c_make_symbol (const char *name, size_t len,
739 scm_t_bits flags, unsigned long hash, SCM props)
740{
741 SCM buf = make_stringbuf (len);
742 memcpy (STRINGBUF_CHARS (buf), name, len);
743
65619ebe
AW
744 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
745 (scm_t_bits) hash, SCM_UNPACK (props));
fd0a5bbc
HWN
746}
747
50b1996f
MG
748/* Returns the number of characters in SYM. This may be different
749 from the memory size of SYM. */
3ee86942
MV
750size_t
751scm_i_symbol_length (SCM sym)
0f2d19dd 752{
3ee86942 753 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
0f2d19dd
JB
754}
755
071bb6a8
LC
756size_t
757scm_c_symbol_length (SCM sym)
758#define FUNC_NAME "scm_c_symbol_length"
759{
760 SCM_VALIDATE_SYMBOL (1, sym);
761
762 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
763}
764#undef FUNC_NAME
765
50b1996f
MG
766/* True if the name of SYM is stored as a Latin-1 encoded string.
767 False if it is stored as a 32-bit UCS-4-encoded string. */
9c44cd45
MG
768int
769scm_i_is_narrow_symbol (SCM sym)
770{
771 SCM buf;
772
773 buf = SYMBOL_STRINGBUF (sym);
774 return !STRINGBUF_WIDE (buf);
775}
776
50b1996f
MG
777/* Returns a pointer to the 8-bit Latin-1 encoded character array that
778 contains the name of SYM. */
3ee86942
MV
779const char *
780scm_i_symbol_chars (SCM sym)
781{
9c44cd45
MG
782 SCM buf;
783
784 buf = SYMBOL_STRINGBUF (sym);
785 if (!STRINGBUF_WIDE (buf))
f59cf998 786 return (const char *) STRINGBUF_CHARS (buf);
9c44cd45
MG
787 else
788 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
789 scm_list_1 (sym));
790}
791
50b1996f
MG
792/* Return a pointer to the 32-bit UCS-4-encoded character array of a
793 symbol's name. */
9c44cd45
MG
794const scm_t_wchar *
795scm_i_symbol_wide_chars (SCM sym)
796{
797 SCM buf;
798
799 buf = SYMBOL_STRINGBUF (sym);
800 if (STRINGBUF_WIDE (buf))
f59cf998 801 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
9c44cd45
MG
802 else
803 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
804 scm_list_1 (sym));
3ee86942 805}
1cc91f1b 806
be54b15d 807SCM
3ee86942 808scm_i_symbol_substring (SCM sym, size_t start, size_t end)
be54b15d 809{
3ee86942 810 SCM buf = SYMBOL_STRINGBUF (sym);
9de87eea 811 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 812 SET_STRINGBUF_SHARED (buf);
9de87eea 813 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
fd2b17b9 814 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
3ee86942
MV
815 (scm_t_bits)start, (scm_t_bits) end - start);
816}
be54b15d 817
50b1996f 818/* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
9c44cd45
MG
819scm_t_wchar
820scm_i_symbol_ref (SCM sym, size_t x)
821{
822 if (scm_i_is_narrow_symbol (sym))
823 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
824 else
825 return scm_i_symbol_wide_chars (sym)[x];
826}
827
3ee86942
MV
828/* Debugging
829 */
be54b15d 830
6ce6923b
MG
831SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
832 "Returns an association list containing debugging information\n"
833 "for @var{str}. The association list has the following entries."
834 "@table @code\n"
835 "@item string\n"
836 "The string itself.\n"
837 "@item start\n"
838 "The start index of the string into its stringbuf\n"
839 "@item length\n"
840 "The length of the string\n"
841 "@item shared\n"
842 "If this string is a substring, it returns its parent string.\n"
843 "Otherwise, it returns @code{#f}\n"
88ed5759
MG
844 "@item read-only\n"
845 "@code{#t} if the string is read-only\n"
6ce6923b
MG
846 "@item stringbuf-chars\n"
847 "A new string containing this string's stringbuf's characters\n"
848 "@item stringbuf-length\n"
849 "The number of characters in this stringbuf\n"
850 "@item stringbuf-shared\n"
851 "@code{#t} if this stringbuf is shared\n"
6ce6923b
MG
852 "@item stringbuf-wide\n"
853 "@code{#t} if this stringbuf's characters are stored in a\n"
854 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
855 "buffer\n"
856 "@end table")
3ee86942
MV
857#define FUNC_NAME s_scm_sys_string_dump
858{
ba54a202 859 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
6ce6923b 860 SCM buf;
3ee86942 861 SCM_VALIDATE_STRING (1, str);
6ce6923b
MG
862
863 /* String info */
4a655e50 864 e1 = scm_cons (scm_from_latin1_symbol ("string"),
6ce6923b 865 str);
4a655e50 866 e2 = scm_cons (scm_from_latin1_symbol ("start"),
6ce6923b 867 scm_from_size_t (STRING_START (str)));
4a655e50 868 e3 = scm_cons (scm_from_latin1_symbol ("length"),
6ce6923b
MG
869 scm_from_size_t (STRING_LENGTH (str)));
870
3ee86942
MV
871 if (IS_SH_STRING (str))
872 {
4a655e50 873 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
6ce6923b
MG
874 SH_STRING_STRING (str));
875 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
3ee86942
MV
876 }
877 else
878 {
4a655e50 879 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
6ce6923b
MG
880 SCM_BOOL_F);
881 buf = STRING_STRINGBUF (str);
3ee86942 882 }
9c44cd45 883
88ed5759 884 if (IS_RO_STRING (str))
4a655e50 885 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
88ed5759
MG
886 SCM_BOOL_T);
887 else
4a655e50 888 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
88ed5759 889 SCM_BOOL_F);
587a3355 890
6ce6923b 891 /* Stringbuf info */
6ce6923b
MG
892 if (!STRINGBUF_WIDE (buf))
893 {
894 size_t len = STRINGBUF_LENGTH (buf);
895 char *cbuf;
190d4b0d 896 SCM sbc = scm_i_make_string (len, &cbuf, 0);
6ce6923b 897 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
4a655e50 898 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b 899 sbc);
3ee86942 900 }
6ce6923b
MG
901 else
902 {
903 size_t len = STRINGBUF_LENGTH (buf);
904 scm_t_wchar *cbuf;
190d4b0d 905 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
6ce6923b
MG
906 u32_cpy ((scm_t_uint32 *) cbuf,
907 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
4a655e50 908 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b
MG
909 sbc);
910 }
4a655e50 911 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
6ce6923b
MG
912 scm_from_size_t (STRINGBUF_LENGTH (buf)));
913 if (STRINGBUF_SHARED (buf))
4a655e50 914 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b
MG
915 SCM_BOOL_T);
916 else
4a655e50 917 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b 918 SCM_BOOL_F);
6ce6923b 919 if (STRINGBUF_WIDE (buf))
4a655e50 920 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
ba54a202 921 SCM_BOOL_T);
6ce6923b 922 else
4a655e50 923 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
ba54a202 924 SCM_BOOL_F);
6ce6923b 925
ba54a202 926 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
3ee86942
MV
927}
928#undef FUNC_NAME
929
6ce6923b
MG
930SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
931 "Returns an association list containing debugging information\n"
932 "for @var{sym}. The association list has the following entries."
933 "@table @code\n"
934 "@item symbol\n"
935 "The symbol itself\n"
936 "@item hash\n"
937 "Its hash value\n"
88ed5759
MG
938 "@item interned\n"
939 "@code{#t} if it is an interned symbol\n"
6ce6923b
MG
940 "@item stringbuf-chars\n"
941 "A new string containing this symbols's stringbuf's characters\n"
942 "@item stringbuf-length\n"
943 "The number of characters in this stringbuf\n"
944 "@item stringbuf-shared\n"
945 "@code{#t} if this stringbuf is shared\n"
6ce6923b
MG
946 "@item stringbuf-wide\n"
947 "@code{#t} if this stringbuf's characters are stored in a\n"
948 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
949 "buffer\n"
950 "@end table")
3ee86942
MV
951#define FUNC_NAME s_scm_sys_symbol_dump
952{
ba54a202 953 SCM e1, e2, e3, e4, e5, e6, e7;
6ce6923b 954 SCM buf;
3ee86942 955 SCM_VALIDATE_SYMBOL (1, sym);
4a655e50 956 e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
6ce6923b 957 sym);
4a655e50 958 e2 = scm_cons (scm_from_latin1_symbol ("hash"),
6ce6923b 959 scm_from_ulong (scm_i_symbol_hash (sym)));
4a655e50 960 e3 = scm_cons (scm_from_latin1_symbol ("interned"),
88ed5759 961 scm_symbol_interned_p (sym));
6ce6923b
MG
962 buf = SYMBOL_STRINGBUF (sym);
963
964 /* Stringbuf info */
6ce6923b
MG
965 if (!STRINGBUF_WIDE (buf))
966 {
967 size_t len = STRINGBUF_LENGTH (buf);
968 char *cbuf;
190d4b0d 969 SCM sbc = scm_i_make_string (len, &cbuf, 0);
6ce6923b 970 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
4a655e50 971 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b
MG
972 sbc);
973 }
9c44cd45 974 else
6ce6923b
MG
975 {
976 size_t len = STRINGBUF_LENGTH (buf);
977 scm_t_wchar *cbuf;
190d4b0d 978 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
6ce6923b
MG
979 u32_cpy ((scm_t_uint32 *) cbuf,
980 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
4a655e50 981 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b
MG
982 sbc);
983 }
4a655e50 984 e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
6ce6923b
MG
985 scm_from_size_t (STRINGBUF_LENGTH (buf)));
986 if (STRINGBUF_SHARED (buf))
4a655e50 987 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b
MG
988 SCM_BOOL_T);
989 else
4a655e50 990 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b 991 SCM_BOOL_F);
6ce6923b 992 if (STRINGBUF_WIDE (buf))
4a655e50 993 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
6ce6923b
MG
994 SCM_BOOL_T);
995 else
4a655e50 996 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
6ce6923b 997 SCM_BOOL_F);
ba54a202 998 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
6ce6923b 999
3ee86942
MV
1000}
1001#undef FUNC_NAME
1002
56a3dcd4 1003#ifdef SCM_STRING_LENGTH_HISTOGRAM
6ce6923b 1004
9c44cd45 1005SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
e1b29f6a 1006#define FUNC_NAME s_scm_sys_stringbuf_hist
3ee86942
MV
1007{
1008 int i;
1009 for (i = 0; i < 1000; i++)
1010 if (lenhist[i])
1011 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1012 fprintf (stderr, ">999: %u\n", lenhist[1000]);
1013 return SCM_UNSPECIFIED;
be54b15d
DH
1014}
1015#undef FUNC_NAME
1016
3ee86942
MV
1017#endif
1018
1019\f
1020
1021SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1022 (SCM obj),
1023 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1024#define FUNC_NAME s_scm_string_p
1025{
1026 return scm_from_bool (IS_STRING (obj));
1027}
1028#undef FUNC_NAME
1029
1030
1031SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1032
1033SCM_DEFINE (scm_string, "string", 0, 0, 1,
1034 (SCM chrs),
1035 "@deffnx {Scheme Procedure} list->string chrs\n"
1036 "Return a newly allocated string composed of the arguments,\n"
1037 "@var{chrs}.")
1038#define FUNC_NAME s_scm_string
1039{
9aa27c1a 1040 SCM result = SCM_BOOL_F;
9c44cd45 1041 SCM rest;
3ee86942 1042 size_t len;
9c44cd45
MG
1043 size_t p = 0;
1044 long i;
9aa27c1a 1045 int wide = 0;
3ee86942 1046
9c44cd45
MG
1047 /* Verify that this is a list of chars. */
1048 i = scm_ilength (chrs);
3c7cf7f5 1049 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
3ee86942 1050
9c44cd45
MG
1051 len = (size_t) i;
1052 rest = chrs;
3ee86942 1053
9c44cd45 1054 while (len > 0 && scm_is_pair (rest))
3ee86942 1055 {
9c44cd45 1056 SCM elt = SCM_CAR (rest);
3ee86942 1057 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
9aa27c1a
MG
1058 if (SCM_CHAR (elt) > 0xFF)
1059 wide = 1;
9c44cd45
MG
1060 rest = SCM_CDR (rest);
1061 len--;
1062 scm_remember_upto_here_1 (elt);
1063 }
1064
1065 /* Construct a string containing this list of chars. */
1066 len = (size_t) i;
1067 rest = chrs;
1068
9aa27c1a 1069 if (wide == 0)
9c44cd45 1070 {
56a3dcd4
LC
1071 char *buf;
1072
190d4b0d 1073 result = scm_i_make_string (len, NULL, 0);
9aa27c1a 1074 result = scm_i_string_start_writing (result);
56a3dcd4 1075 buf = scm_i_string_writable_chars (result);
9aa27c1a
MG
1076 while (len > 0 && scm_is_pair (rest))
1077 {
1078 SCM elt = SCM_CAR (rest);
1079 buf[p] = (unsigned char) SCM_CHAR (elt);
1080 p++;
1081 rest = SCM_CDR (rest);
1082 len--;
1083 scm_remember_upto_here_1 (elt);
1084 }
1085 }
1086 else
1087 {
56a3dcd4
LC
1088 scm_t_wchar *buf;
1089
190d4b0d 1090 result = scm_i_make_wide_string (len, NULL, 0);
9aa27c1a 1091 result = scm_i_string_start_writing (result);
56a3dcd4 1092 buf = scm_i_string_writable_wide_chars (result);
9aa27c1a
MG
1093 while (len > 0 && scm_is_pair (rest))
1094 {
1095 SCM elt = SCM_CAR (rest);
1096 buf[p] = SCM_CHAR (elt);
1097 p++;
1098 rest = SCM_CDR (rest);
1099 len--;
1100 scm_remember_upto_here_1 (elt);
1101 }
3ee86942 1102 }
9c44cd45
MG
1103 scm_i_string_stop_writing ();
1104
3ee86942
MV
1105 if (len > 0)
1106 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
9c44cd45 1107 if (!scm_is_null (rest))
3ee86942
MV
1108 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1109
1110 return result;
1111}
1112#undef FUNC_NAME
be54b15d 1113
3b3b36dd 1114SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
6fa73e72 1115 (SCM k, SCM chr),
0d26a824
MG
1116 "Return a newly allocated string of\n"
1117 "length @var{k}. If @var{chr} is given, then all elements of\n"
1118 "the string are initialized to @var{chr}, otherwise the contents\n"
3ef6650d 1119 "of the @var{string} are all set to @var{#\nul}.")
1bbd0b84 1120#define FUNC_NAME s_scm_make_string
0f2d19dd 1121{
3ee86942
MV
1122 return scm_c_make_string (scm_to_size_t (k), chr);
1123}
1124#undef FUNC_NAME
1125
1126SCM
1127scm_c_make_string (size_t len, SCM chr)
1128#define FUNC_NAME NULL
1129{
9c44cd45 1130 size_t p;
3ef6650d 1131 char *contents = NULL;
190d4b0d 1132 SCM res = scm_i_make_string (len, &contents, 0);
cb0d8be2 1133
3ef6650d
AW
1134 /* If no char is given, initialize string contents to NULL. */
1135 if (SCM_UNBNDP (chr))
1136 memset (contents, 0, len);
1137 else
e11e83f3 1138 {
3ee86942 1139 SCM_VALIDATE_CHAR (0, chr);
9c44cd45
MG
1140 res = scm_i_string_start_writing (res);
1141 for (p = 0; p < len; p++)
1142 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1143 scm_i_string_stop_writing ();
0f2d19dd 1144 }
e11e83f3
MV
1145
1146 return res;
0f2d19dd 1147}
1bbd0b84 1148#undef FUNC_NAME
0f2d19dd 1149
3b3b36dd 1150SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
0d26a824
MG
1151 (SCM string),
1152 "Return the number of characters in @var{string}.")
1bbd0b84 1153#define FUNC_NAME s_scm_string_length
0f2d19dd 1154{
d1ca2c64 1155 SCM_VALIDATE_STRING (1, string);
3ee86942 1156 return scm_from_size_t (STRING_LENGTH (string));
0f2d19dd 1157}
1bbd0b84 1158#undef FUNC_NAME
0f2d19dd 1159
f8ba2bb9 1160SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
9c44cd45
MG
1161 (SCM string),
1162 "Return the bytes used to represent a character in @var{string}."
1163 "This will return 1 or 4.")
f8ba2bb9 1164#define FUNC_NAME s_scm_string_bytes_per_char
9c44cd45
MG
1165{
1166 SCM_VALIDATE_STRING (1, string);
1167 if (!scm_i_is_narrow_string (string))
1168 return scm_from_int (4);
1169
1170 return scm_from_int (1);
1171}
1172#undef FUNC_NAME
1173
3ee86942
MV
1174size_t
1175scm_c_string_length (SCM string)
1176{
1177 if (!IS_STRING (string))
1178 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1179 return STRING_LENGTH (string);
1180}
1181
bd9e24b3 1182SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
6fa73e72 1183 (SCM str, SCM k),
9c44cd45
MG
1184 "Return character @var{k} of @var{str} using zero-origin\n"
1185 "indexing. @var{k} must be a valid index of @var{str}.")
1bbd0b84 1186#define FUNC_NAME s_scm_string_ref
0f2d19dd 1187{
3ae3166b 1188 size_t len;
a55c2b68 1189 unsigned long idx;
bd9e24b3 1190
d1ca2c64 1191 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1192
1193 len = scm_i_string_length (str);
1194 if (SCM_LIKELY (len > 0))
1195 idx = scm_to_unsigned_integer (k, 0, len - 1);
1196 else
1197 scm_out_of_range (NULL, k);
1198
9c44cd45
MG
1199 if (scm_i_is_narrow_string (str))
1200 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1201 else
1202 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
0f2d19dd 1203}
1bbd0b84 1204#undef FUNC_NAME
0f2d19dd 1205
3ee86942
MV
1206SCM
1207scm_c_string_ref (SCM str, size_t p)
1208{
1209 if (p >= scm_i_string_length (str))
1210 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1211 if (scm_i_is_narrow_string (str))
1212 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1213 else
1214 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1215
3ee86942 1216}
f0942910 1217
3b3b36dd 1218SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
6fa73e72 1219 (SCM str, SCM k, SCM chr),
9c44cd45
MG
1220 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1221 "an unspecified value. @var{k} must be a valid index of\n"
1222 "@var{str}.")
1bbd0b84 1223#define FUNC_NAME s_scm_string_set_x
0f2d19dd 1224{
3ae3166b 1225 size_t len;
a55c2b68
MV
1226 unsigned long idx;
1227
f0942910 1228 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1229
1230 len = scm_i_string_length (str);
1231 if (SCM_LIKELY (len > 0))
1232 idx = scm_to_unsigned_integer (k, 0, len - 1);
1233 else
1234 scm_out_of_range (NULL, k);
1235
34d19ef6 1236 SCM_VALIDATE_CHAR (3, chr);
9c44cd45
MG
1237 str = scm_i_string_start_writing (str);
1238 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1239 scm_i_string_stop_writing ();
1240
0f2d19dd
JB
1241 return SCM_UNSPECIFIED;
1242}
1bbd0b84 1243#undef FUNC_NAME
0f2d19dd 1244
3ee86942
MV
1245void
1246scm_c_string_set_x (SCM str, size_t p, SCM chr)
1247{
1248 if (p >= scm_i_string_length (str))
1249 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1250 str = scm_i_string_start_writing (str);
1251 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1252 scm_i_string_stop_writing ();
3ee86942 1253}
0f2d19dd 1254
3b3b36dd 1255SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
0d26a824
MG
1256 (SCM str, SCM start, SCM end),
1257 "Return a newly allocated string formed from the characters\n"
1258 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1259 "ending with index @var{end} (exclusive).\n"
1260 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1261 "exact integers satisfying:\n\n"
1262 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1bbd0b84 1263#define FUNC_NAME s_scm_substring
0f2d19dd 1264{
3ee86942 1265 size_t len, from, to;
685c0d71 1266
d1ca2c64 1267 SCM_VALIDATE_STRING (1, str);
3ee86942
MV
1268 len = scm_i_string_length (str);
1269 from = scm_to_unsigned_integer (start, 0, len);
a55c2b68 1270 if (SCM_UNBNDP (end))
3ee86942 1271 to = len;
a55c2b68 1272 else
3ee86942
MV
1273 to = scm_to_unsigned_integer (end, from, len);
1274 return scm_i_substring (str, from, to);
0f2d19dd 1275}
1bbd0b84 1276#undef FUNC_NAME
0f2d19dd 1277
ed35de72
MV
1278SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1279 (SCM str, SCM start, SCM end),
1280 "Return a newly allocated string formed from the characters\n"
1281 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1282 "ending with index @var{end} (exclusive).\n"
1283 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1284 "exact integers satisfying:\n"
1285 "\n"
1286 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1287 "\n"
1288 "The returned string is read-only.\n")
1289#define FUNC_NAME s_scm_substring_read_only
1290{
1291 size_t len, from, to;
1292
1293 SCM_VALIDATE_STRING (1, str);
1294 len = scm_i_string_length (str);
1295 from = scm_to_unsigned_integer (start, 0, len);
1296 if (SCM_UNBNDP (end))
1297 to = len;
1298 else
1299 to = scm_to_unsigned_integer (end, from, len);
1300 return scm_i_substring_read_only (str, from, to);
1301}
1302#undef FUNC_NAME
1303
3ee86942
MV
1304SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1305 (SCM str, SCM start, SCM end),
1306 "Return a newly allocated string formed from the characters\n"
1307 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1308 "ending with index @var{end} (exclusive).\n"
1309 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1310 "exact integers satisfying:\n\n"
1311 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1312#define FUNC_NAME s_scm_substring_copy
1313{
e1b29f6a
MV
1314 /* For the Scheme version, START is mandatory, but for the C
1315 version, it is optional. See scm_string_copy in srfi-13.c for a
1316 rationale.
1317 */
1318
1319 size_t from, to;
3ee86942
MV
1320
1321 SCM_VALIDATE_STRING (1, str);
e1b29f6a
MV
1322 scm_i_get_substring_spec (scm_i_string_length (str),
1323 start, &from, end, &to);
3ee86942
MV
1324 return scm_i_substring_copy (str, from, to);
1325}
1326#undef FUNC_NAME
1327
1328SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1329 (SCM str, SCM start, SCM end),
1330 "Return string that indirectly refers to the characters\n"
1331 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1332 "ending with index @var{end} (exclusive).\n"
1333 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1334 "exact integers satisfying:\n\n"
1335 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1336#define FUNC_NAME s_scm_substring_shared
1337{
1338 size_t len, from, to;
1339
1340 SCM_VALIDATE_STRING (1, str);
1341 len = scm_i_string_length (str);
1342 from = scm_to_unsigned_integer (start, 0, len);
1343 if (SCM_UNBNDP (end))
1344 to = len;
1345 else
1346 to = scm_to_unsigned_integer (end, from, len);
1347 return scm_i_substring_shared (str, from, to);
1348}
1349#undef FUNC_NAME
685c0d71 1350
3b3b36dd 1351SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
6fa73e72 1352 (SCM args),
9c44cd45 1353 "Return a newly allocated string whose characters form the\n"
0d26a824 1354 "concatenation of the given strings, @var{args}.")
1bbd0b84 1355#define FUNC_NAME s_scm_string_append
0f2d19dd
JB
1356{
1357 SCM res;
9c44cd45
MG
1358 size_t len = 0;
1359 int wide = 0;
c829a427 1360 SCM l, s;
bd4911ef 1361 size_t i;
9909c395
MG
1362 union
1363 {
1364 char *narrow;
1365 scm_t_wchar *wide;
1366 } data;
af45e3b0
DH
1367
1368 SCM_VALIDATE_REST_ARGUMENT (args);
9c44cd45 1369 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427
MV
1370 {
1371 s = SCM_CAR (l);
1372 SCM_VALIDATE_STRING (SCM_ARGn, s);
9c44cd45
MG
1373 len += scm_i_string_length (s);
1374 if (!scm_i_is_narrow_string (s))
1375 wide = 1;
c829a427 1376 }
9909c395 1377 data.narrow = NULL;
9c44cd45 1378 if (!wide)
190d4b0d 1379 res = scm_i_make_string (len, &data.narrow, 0);
9c44cd45 1380 else
190d4b0d 1381 res = scm_i_make_wide_string (len, &data.wide, 0);
9c44cd45
MG
1382
1383 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427 1384 {
edea856c 1385 size_t len;
c829a427 1386 s = SCM_CAR (l);
3ee86942 1387 SCM_VALIDATE_STRING (SCM_ARGn, s);
edea856c 1388 len = scm_i_string_length (s);
9c44cd45
MG
1389 if (!wide)
1390 {
9909c395
MG
1391 memcpy (data.narrow, scm_i_string_chars (s), len);
1392 data.narrow += len;
9c44cd45
MG
1393 }
1394 else
1395 {
1396 if (scm_i_is_narrow_string (s))
1397 {
1398 for (i = 0; i < scm_i_string_length (s); i++)
9909c395 1399 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
9c44cd45
MG
1400 }
1401 else
9909c395 1402 u32_cpy ((scm_t_uint32 *) data.wide,
9c44cd45 1403 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
9909c395 1404 data.wide += len;
9c44cd45 1405 }
c829a427
MV
1406 scm_remember_upto_here_1 (s);
1407 }
0f2d19dd
JB
1408 return res;
1409}
1bbd0b84 1410#undef FUNC_NAME
0f2d19dd 1411
24933780 1412
a3d7d5d5 1413\f
c62da8f8 1414/* Charset conversion error handling. */
a3d7d5d5
LC
1415
1416SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
c62da8f8
LC
1417SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1418
6851d3be
LC
1419/* Raise an exception informing that character CHR could not be written
1420 to PORT in its current encoding. */
d14418a5 1421void
ef7e4ba3 1422scm_encoding_error (const char *subr, int err, const char *message,
6851d3be 1423 SCM port, SCM chr)
ef7e4ba3 1424{
ef7e4ba3 1425 scm_throw (scm_encoding_error_key,
d050ef66
AW
1426 scm_list_n (scm_from_latin1_string (subr),
1427 scm_from_latin1_string (message),
ef7e4ba3 1428 scm_from_int (err),
6851d3be 1429 port, chr,
ef7e4ba3 1430 SCM_UNDEFINED));
a3d7d5d5
LC
1431}
1432
c62da8f8
LC
1433/* Raise an exception informing of an encoding error on PORT. This
1434 means that a character could not be written in PORT's encoding. */
1435void
1436scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1437{
1438 scm_throw (scm_decoding_error_key,
d050ef66
AW
1439 scm_list_n (scm_from_latin1_string (subr),
1440 scm_from_latin1_string (message),
c62da8f8
LC
1441 scm_from_int (err),
1442 port,
1443 SCM_UNDEFINED));
1444}
1445
1446\f
1447/* String conversion to/from C. */
1448
fac32b51 1449SCM
587a3355
MG
1450scm_from_stringn (const char *str, size_t len, const char *encoding,
1451 scm_t_string_failed_conversion_handler handler)
1452{
1453 size_t u32len, i;
1454 scm_t_wchar *u32;
1455 int wide = 0;
1456 SCM res;
1457
d40e1ca8 1458 /* The order of these checks is important. */
a574564c 1459 if (!str && len != 0)
d40e1ca8
AW
1460 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1461 if (len == (size_t) -1)
1462 len = strlen (str);
a574564c
AW
1463 if (len == 0)
1464 return scm_nullstr;
fac32b51 1465
889975e5
MG
1466 if (encoding == NULL)
1467 {
1468 /* If encoding is null, use Latin-1. */
1469 char *buf;
190d4b0d 1470 res = scm_i_make_string (len, &buf, 0);
889975e5
MG
1471 memcpy (buf, str, len);
1472 return res;
1473 }
1474
587a3355
MG
1475 u32len = 0;
1476 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1477 (enum iconv_ilseq_handler)
1478 handler,
1479 str, len,
1480 NULL,
1481 NULL, &u32len);
1482
ef7e4ba3 1483 if (SCM_UNLIKELY (u32 == NULL))
587a3355 1484 {
ef7e4ba3
LC
1485 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1486 handler. */
1487 SCM bv;
1488 signed char *buf;
1489
1490 buf = scm_gc_malloc_pointerless (len, "bytevector");
1491 memcpy (buf, str, len);
fb031aba 1492 bv = scm_c_take_gc_bytevector (buf, len);
ef7e4ba3 1493
c62da8f8
LC
1494 scm_decoding_error (__func__, errno,
1495 "input locale conversion error", bv);
587a3355
MG
1496 }
1497
1498 i = 0;
1499 while (i < u32len)
1500 if (u32[i++] > 0xFF)
1501 {
1502 wide = 1;
1503 break;
1504 }
1505
1506 if (!wide)
1507 {
1508 char *dst;
190d4b0d 1509 res = scm_i_make_string (u32len, &dst, 0);
587a3355
MG
1510 for (i = 0; i < u32len; i ++)
1511 dst[i] = (unsigned char) u32[i];
1512 dst[u32len] = '\0';
1513 }
1514 else
1515 {
1516 scm_t_wchar *wdst;
190d4b0d 1517 res = scm_i_make_wide_string (u32len, &wdst, 0);
587a3355
MG
1518 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1519 wdst[u32len] = 0;
1520 }
1521
1522 free (u32);
1523 return res;
1524}
1525
cf313a94 1526SCM
d40e1ca8 1527scm_from_locale_string (const char *str)
cf313a94 1528{
d40e1ca8 1529 return scm_from_locale_stringn (str, -1);
cf313a94
MG
1530}
1531
c829a427
MV
1532SCM
1533scm_from_locale_stringn (const char *str, size_t len)
1534{
95f5e303
AW
1535 return scm_from_stringn (str, len, locale_charset (),
1536 scm_i_get_conversion_strategy (SCM_BOOL_F));
c829a427 1537}
4d4528e7 1538
c829a427 1539SCM
d40e1ca8 1540scm_from_latin1_string (const char *str)
4d4528e7 1541{
d40e1ca8
AW
1542 return scm_from_latin1_stringn (str, -1);
1543}
9c44cd45 1544
d40e1ca8
AW
1545SCM
1546scm_from_latin1_stringn (const char *str, size_t len)
1547{
e9a35a96
LC
1548 char *buf;
1549 SCM result;
1550
1551 if (len == (size_t) -1)
1552 len = strlen (str);
1553
1554 /* Make a narrow string and copy STR as is. */
190d4b0d 1555 result = scm_i_make_string (len, &buf, 0);
e9a35a96
LC
1556 memcpy (buf, str, len);
1557
1558 return result;
c829a427 1559}
4d4528e7 1560
587a3355 1561SCM
d40e1ca8 1562scm_from_utf8_string (const char *str)
587a3355 1563{
d40e1ca8
AW
1564 return scm_from_utf8_stringn (str, -1);
1565}
1566
1567SCM
1568scm_from_utf8_stringn (const char *str, size_t len)
1569{
1570 return scm_from_stringn (str, len, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
587a3355
MG
1571}
1572
647dc1ac
LC
1573SCM
1574scm_from_utf32_string (const scm_t_wchar *str)
1575{
1576 return scm_from_utf32_stringn (str, -1);
1577}
1578
1579SCM
1580scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1581{
1582 SCM result;
1583 scm_t_wchar *buf;
1584
1585 if (len == (size_t) -1)
1586 len = u32_strlen ((uint32_t *) str);
1587
190d4b0d 1588 result = scm_i_make_wide_string (len, &buf, 0);
647dc1ac
LC
1589 memcpy (buf, str, len * sizeof (scm_t_wchar));
1590 scm_i_try_narrow_string (result);
1591
1592 return result;
1593}
1594
50b1996f
MG
1595/* Create a new scheme string from the C string STR. The memory of
1596 STR may be used directly as storage for the new string. */
13a94556
LC
1597/* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1598 would be to register a finalizer to eventually free(3) STR, which isn't
1599 worth it. Should we just deprecate the `scm_take_' functions? */
c829a427
MV
1600SCM
1601scm_take_locale_stringn (char *str, size_t len)
1602{
13a94556 1603 SCM res;
48ddf0d9 1604
13a94556
LC
1605 res = scm_from_locale_stringn (str, len);
1606 free (str);
c829a427 1607
c829a427
MV
1608 return res;
1609}
1610
48ddf0d9
KR
1611SCM
1612scm_take_locale_string (char *str)
1613{
1614 return scm_take_locale_stringn (str, -1);
1615}
1616
f1ee6d54
LC
1617/* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1618 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
31d4d02b
LC
1619 Set *LENP to the size of the resulting string.
1620
1621 FIXME: This is a hack we should get rid of. See
1622 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1623 for details. */
1624static void
1625unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
9c44cd45
MG
1626{
1627 char *before, *after;
1628 size_t i, j;
1629
4ff2b9f4
LC
1630 before = buf;
1631 after = buf;
9c44cd45
MG
1632 i = 0;
1633 j = 0;
1634 while (i < *lenp)
1635 {
1636 if ((i <= *lenp - 6)
1637 && before[i] == '\\'
1638 && before[i + 1] == 'u'
1639 && before[i + 2] == '0' && before[i + 3] == '0')
1640 {
1641 /* Convert \u00NN to \xNN */
1642 after[j] = '\\';
1643 after[j + 1] = 'x';
30a6b9ca
MG
1644 after[j + 2] = tolower ((int) before[i + 4]);
1645 after[j + 3] = tolower ((int) before[i + 5]);
9c44cd45
MG
1646 i += 6;
1647 j += 4;
1648 }
1649 else if ((i <= *lenp - 10)
1650 && before[i] == '\\'
1651 && before[i + 1] == 'U'
1652 && before[i + 2] == '0' && before[i + 3] == '0')
1653 {
1654 /* Convert \U00NNNNNN to \UNNNNNN */
1655 after[j] = '\\';
1656 after[j + 1] = 'U';
30a6b9ca
MG
1657 after[j + 2] = tolower ((int) before[i + 4]);
1658 after[j + 3] = tolower ((int) before[i + 5]);
1659 after[j + 4] = tolower ((int) before[i + 6]);
1660 after[j + 5] = tolower ((int) before[i + 7]);
1661 after[j + 6] = tolower ((int) before[i + 8]);
1662 after[j + 7] = tolower ((int) before[i + 9]);
9c44cd45
MG
1663 i += 10;
1664 j += 8;
1665 }
1666 else
1667 {
1668 after[j] = before[i];
1669 i++;
1670 j++;
1671 }
1672 }
1673 *lenp = j;
9c44cd45
MG
1674}
1675
f1ee6d54
LC
1676/* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1677 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1678 of the resulting string. BUF must be large enough to handle the
1679 worst case when `\uXXXX' escapes (6 characters) are replaced by
1680 `\xXXXX;' (7 characters). */
31d4d02b
LC
1681static void
1682unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
d31b9519
MG
1683{
1684 char *before, *after;
1685 size_t i, j;
1686 /* The worst case is if the input string contains all 4-digit hex escapes.
1687 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1688 size_t max_out_len = (*lenp * 7) / 6 + 1;
1689 size_t nzeros, ndigits;
1690
4ff2b9f4 1691 before = buf;
d31b9519
MG
1692 after = alloca (max_out_len);
1693 i = 0;
1694 j = 0;
1695 while (i < *lenp)
1696 {
1697 if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1698 || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1699 {
1700 if (before[i + 1] == 'u')
1701 ndigits = 4;
1702 else if (before[i + 1] == 'U')
1703 ndigits = 8;
1704 else
1705 abort ();
1706
1707 /* Add the R6RS hex escape initial sequence. */
1708 after[j] = '\\';
1709 after[j + 1] = 'x';
1710
1711 /* Move string positions to the start of the hex numbers. */
1712 i += 2;
1713 j += 2;
1714
1715 /* Find the number of initial zeros in this hex number. */
1716 nzeros = 0;
1717 while (before[i + nzeros] == '0' && nzeros < ndigits)
1718 nzeros++;
1719
1720 /* Copy the number, skipping initial zeros, and then move the string
1721 positions. */
1722 if (nzeros == ndigits)
1723 {
1724 after[j] = '0';
1725 i += ndigits;
1726 j += 1;
1727 }
1728 else
1729 {
1730 int pos;
1731 for (pos = 0; pos < ndigits - nzeros; pos++)
1732 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1733 i += ndigits;
1734 j += (ndigits - nzeros);
1735 }
1736
1737 /* Add terminating semicolon. */
1738 after[j] = ';';
1739 j++;
1740 }
1741 else
1742 {
1743 after[j] = before[i];
1744 i++;
1745 j++;
1746 }
1747 }
1748 *lenp = j;
d31b9519
MG
1749 memcpy (before, after, j);
1750}
1751
cf313a94 1752char *
d40e1ca8 1753scm_to_locale_string (SCM str)
cf313a94 1754{
d40e1ca8 1755 return scm_to_locale_stringn (str, NULL);
cf313a94 1756}
d31b9519 1757
c829a427 1758char *
fac32b51 1759scm_to_locale_stringn (SCM str, size_t *lenp)
c829a427 1760{
889975e5 1761 return scm_to_stringn (str, lenp,
95f5e303 1762 locale_charset (),
889975e5 1763 scm_i_get_conversion_strategy (SCM_BOOL_F));
9c44cd45
MG
1764}
1765
d40e1ca8
AW
1766char *
1767scm_to_latin1_string (SCM str)
1768{
1769 return scm_to_latin1_stringn (str, NULL);
1770}
1771
1772char *
1773scm_to_latin1_stringn (SCM str, size_t *lenp)
e9a35a96 1774#define FUNC_NAME "scm_to_latin1_stringn"
d40e1ca8 1775{
e9a35a96
LC
1776 char *result;
1777
1778 SCM_VALIDATE_STRING (1, str);
1779
1780 if (scm_i_is_narrow_string (str))
1781 {
fe133640
AW
1782 size_t len = scm_i_string_length (str);
1783
e9a35a96 1784 if (lenp)
fe133640 1785 *lenp = len;
e9a35a96 1786
fe133640 1787 result = scm_strndup (scm_i_string_data (str), len);
e9a35a96
LC
1788 }
1789 else
1790 result = scm_to_stringn (str, lenp, NULL,
fe133640 1791 SCM_FAILED_CONVERSION_ERROR);
e9a35a96
LC
1792
1793 return result;
d40e1ca8 1794}
e9a35a96 1795#undef FUNC_NAME
d40e1ca8
AW
1796
1797char *
1798scm_to_utf8_string (SCM str)
1799{
1800 return scm_to_utf8_stringn (str, NULL);
1801}
1802
1803char *
1804scm_to_utf8_stringn (SCM str, size_t *lenp)
1805{
1806 return scm_to_stringn (str, lenp, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1807}
1808
647dc1ac
LC
1809scm_t_wchar *
1810scm_to_utf32_string (SCM str)
1811{
1812 return scm_to_utf32_stringn (str, NULL);
1813}
1814
1815scm_t_wchar *
1816scm_to_utf32_stringn (SCM str, size_t *lenp)
1817#define FUNC_NAME "scm_to_utf32_stringn"
1818{
1819 scm_t_wchar *result;
1820
1821 SCM_VALIDATE_STRING (1, str);
1822
1823 if (scm_i_is_narrow_string (str))
1824 result = (scm_t_wchar *)
1825 scm_to_stringn (str, lenp, "UTF-32",
1826 SCM_FAILED_CONVERSION_ERROR);
1827 else
1828 {
1829 size_t len;
1830
1831 len = scm_i_string_length (str);
1832 if (lenp)
1833 *lenp = len;
1834
1835 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
1836 memcpy (result, scm_i_string_wide_chars (str),
1837 len * sizeof (scm_t_wchar));
1838 result[len] = 0;
1839 }
1840
1841 return result;
1842}
1843#undef FUNC_NAME
1844
29bcdbb0
LC
1845/* Return a malloc(3)-allocated buffer containing the contents of STR encoded
1846 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
1847 the returned buffer. If the conversion to ENCODING fails, apply the strategy
1848 defined by HANDLER. */
9c44cd45 1849char *
587a3355 1850scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
eca29b02 1851 scm_t_string_failed_conversion_handler handler)
9c44cd45 1852{
9c44cd45
MG
1853 char *buf;
1854 size_t ilen, len, i;
889975e5
MG
1855 int ret;
1856 const char *enc;
4d4528e7 1857
3ee86942 1858 if (!scm_is_string (str))
c829a427 1859 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1860 ilen = scm_i_string_length (str);
1861
1862 if (ilen == 0)
1863 {
1864 buf = scm_malloc (1);
1865 buf[0] = '\0';
1866 if (lenp)
1867 *lenp = 0;
1868 return buf;
1869 }
587a3355 1870
c829a427 1871 if (lenp == NULL)
9c44cd45
MG
1872 for (i = 0; i < ilen; i++)
1873 if (scm_i_string_ref (str, i) == '\0')
1874 scm_misc_error (NULL,
1875 "string contains #\\nul character: ~S",
1876 scm_list_1 (str));
1877
889975e5 1878 if (scm_i_is_narrow_string (str) && (encoding == NULL))
c829a427 1879 {
889975e5
MG
1880 /* If using native Latin-1 encoding, just copy the string
1881 contents. */
9c44cd45
MG
1882 if (lenp)
1883 {
1884 buf = scm_malloc (ilen);
1885 memcpy (buf, scm_i_string_chars (str), ilen);
1886 *lenp = ilen;
1887 return buf;
1888 }
1889 else
1890 {
1891 buf = scm_malloc (ilen + 1);
1892 memcpy (buf, scm_i_string_chars (str), ilen);
1893 buf[ilen] = '\0';
1894 return buf;
1895 }
c829a427 1896 }
9c44cd45 1897
587a3355 1898
9c44cd45
MG
1899 buf = NULL;
1900 len = 0;
889975e5
MG
1901 enc = encoding;
1902 if (enc == NULL)
1903 enc = "ISO-8859-1";
1904 if (scm_i_is_narrow_string (str))
1905 {
1906 ret = mem_iconveh (scm_i_string_chars (str), ilen,
1907 "ISO-8859-1", enc,
1908 (enum iconv_ilseq_handler) handler, NULL,
1909 &buf, &len);
9c44cd45 1910
889975e5 1911 if (ret != 0)
ef7e4ba3 1912 scm_encoding_error (__func__, errno,
6851d3be
LC
1913 "cannot convert narrow string to output locale",
1914 SCM_BOOL_F,
1915 /* FIXME: Faulty character unknown. */
1916 SCM_BOOL_F);
889975e5
MG
1917 }
1918 else
1919 {
d31b9519 1920 buf = u32_conv_to_encoding (enc,
889975e5 1921 (enum iconv_ilseq_handler) handler,
d31b9519 1922 (scm_t_uint32 *) scm_i_string_wide_chars (str),
889975e5
MG
1923 ilen,
1924 NULL,
1925 NULL, &len);
1926 if (buf == NULL)
ef7e4ba3 1927 scm_encoding_error (__func__, errno,
6851d3be
LC
1928 "cannot convert wide string to output locale",
1929 SCM_BOOL_F,
1930 /* FIXME: Faulty character unknown. */
1931 SCM_BOOL_F);
d31b9519
MG
1932 }
1933 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
1934 {
1935 if (SCM_R6RS_ESCAPES_P)
f1ee6d54
LC
1936 {
1937 /* The worst case is if the input string contains all 4-digit
1938 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
1939 (seven characters). Make BUF large enough to hold
1940 that. */
1941 buf = scm_realloc (buf, (len * 7) / 6 + 1);
31d4d02b 1942 unistring_escapes_to_r6rs_escapes (buf, &len);
f1ee6d54 1943 }
d31b9519 1944 else
31d4d02b 1945 unistring_escapes_to_guile_escapes (buf, &len);
4ff2b9f4
LC
1946
1947 buf = scm_realloc (buf, len);
889975e5 1948 }
9c44cd45 1949 if (lenp)
4d4528e7 1950 *lenp = len;
9c44cd45
MG
1951 else
1952 {
1953 buf = scm_realloc (buf, len + 1);
1954 buf[len] = '\0';
1955 }
24933780 1956
c829a427 1957 scm_remember_upto_here_1 (str);
9c44cd45 1958 return buf;
4d4528e7 1959}
af68e5e5 1960
c829a427
MV
1961size_t
1962scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
1963{
1964 size_t len;
9c44cd45 1965 char *result = NULL;
3ee86942 1966 if (!scm_is_string (str))
c829a427 1967 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1968 result = scm_to_locale_stringn (str, &len);
1969
1970 memcpy (buf, result, (len > max_len) ? max_len : len);
1971 free (result);
1972
c829a427
MV
1973 scm_remember_upto_here_1 (str);
1974 return len;
1975}
af68e5e5 1976
a3d7d5d5
LC
1977\f
1978/* Unicode string normalization. */
1979
edb7bb47
JG
1980/* This function is a partial clone of SCM_STRING_TO_U32_BUF from
1981 libguile/i18n.c. It would be useful to have this factored out into a more
1982 convenient location, but its use of alloca makes that tricky to do. */
1983
1984static SCM
1985normalize_str (SCM string, uninorm_t form)
1986{
1987 SCM ret;
1988 scm_t_uint32 *w_str;
1989 scm_t_wchar *cbuf;
1990 size_t rlen, len = scm_i_string_length (string);
1991
1992 if (scm_i_is_narrow_string (string))
1993 {
1994 size_t i;
1995 const char *buf = scm_i_string_chars (string);
1996
1997 w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
1998
1999 for (i = 0; i < len; i ++)
2000 w_str[i] = (unsigned char) buf[i];
2001 w_str[len] = 0;
2002 }
d8164b04
JG
2003 else
2004 w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2005
edb7bb47
JG
2006 w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2007
190d4b0d 2008 ret = scm_i_make_wide_string (rlen, &cbuf, 0);
edb7bb47
JG
2009 u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2010 free (w_str);
d8164b04
JG
2011
2012 scm_i_try_narrow_string (ret);
2013
edb7bb47
JG
2014 return ret;
2015}
2016
2017SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2018 (SCM string),
2019 "Returns the NFC normalized form of @var{string}.")
2020#define FUNC_NAME s_scm_string_normalize_nfc
2021{
2022 SCM_VALIDATE_STRING (1, string);
2023 return normalize_str (string, UNINORM_NFC);
2024}
2025#undef FUNC_NAME
2026
2027SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2028 (SCM string),
2029 "Returns the NFD normalized form of @var{string}.")
2030#define FUNC_NAME s_scm_string_normalize_nfd
2031{
2032 SCM_VALIDATE_STRING (1, string);
2033 return normalize_str (string, UNINORM_NFD);
2034}
2035#undef FUNC_NAME
2036
2037SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2038 (SCM string),
2039 "Returns the NFKC normalized form of @var{string}.")
2040#define FUNC_NAME s_scm_string_normalize_nfkc
2041{
2042 SCM_VALIDATE_STRING (1, string);
2043 return normalize_str (string, UNINORM_NFKC);
2044}
2045#undef FUNC_NAME
2046
2047SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2048 (SCM string),
2049 "Returns the NFKD normalized form of @var{string}.")
2050#define FUNC_NAME s_scm_string_normalize_nfkd
2051{
2052 SCM_VALIDATE_STRING (1, string);
2053 return normalize_str (string, UNINORM_NFKD);
2054}
2055#undef FUNC_NAME
2056
7505c6e0
MW
2057/* converts C scm_array of strings to SCM scm_list of strings.
2058 If argc < 0, a null terminated scm_array is assumed.
2059 The current locale encoding is assumed */
9c44cd45 2060SCM
3ee86942
MV
2061scm_makfromstrs (int argc, char **argv)
2062{
2063 int i = argc;
2064 SCM lst = SCM_EOL;
2065 if (0 > i)
2066 for (i = 0; argv[i]; i++);
2067 while (i--)
2068 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2069 return lst;
2070}
2071
c829a427 2072/* Return a newly allocated array of char pointers to each of the strings
7505c6e0
MW
2073 in args, with a terminating NULL pointer. The strings are encoded using
2074 the current locale. */
c829a427
MV
2075
2076char **
2077scm_i_allocate_string_pointers (SCM list)
2a776823 2078#define FUNC_NAME "scm_i_allocate_string_pointers"
af68e5e5 2079{
c829a427 2080 char **result;
7505c6e0 2081 int list_len = scm_ilength (list);
c829a427
MV
2082 int i;
2083
7505c6e0 2084 if (list_len < 0)
c829a427
MV
2085 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2086
7505c6e0 2087 result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2a776823 2088 "string pointers");
7505c6e0 2089 result[list_len] = NULL;
c829a427 2090
7505c6e0 2091 /* The list might have been modified in another thread, so
c829a427
MV
2092 we check LIST before each access.
2093 */
7505c6e0 2094 for (i = 0; i < list_len && scm_is_pair (list); i++)
c829a427 2095 {
7505c6e0
MW
2096 SCM str = SCM_CAR (list);
2097 size_t len; /* String length in bytes */
2098 char *c_str = scm_to_locale_stringn (str, &len);
2099
2100 /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2101 scm_malloc to allocate the returned string, which must be
2102 explicitly deallocated. This forces us to copy the string a
2103 second time into a new buffer. Ideally there would be variants
2104 of scm_to_*_stringn that can return garbage-collected buffers. */
2105
2106 result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2107 memcpy (result[i], c_str, len);
2a776823 2108 result[i][len] = '\0';
7505c6e0 2109 free (c_str);
2a776823 2110
c829a427
MV
2111 list = SCM_CDR (list);
2112 }
2113
c829a427 2114 return result;
af68e5e5 2115}
2a776823 2116#undef FUNC_NAME
24933780 2117
6f14f578
MV
2118void
2119scm_i_get_substring_spec (size_t len,
2120 SCM start, size_t *cstart,
2121 SCM end, size_t *cend)
2122{
2123 if (SCM_UNBNDP (start))
2124 *cstart = 0;
2125 else
2126 *cstart = scm_to_unsigned_integer (start, 0, len);
2127
2128 if (SCM_UNBNDP (end))
2129 *cend = len;
2130 else
2131 *cend = scm_to_unsigned_integer (end, *cstart, len);
2132}
2133
3ee86942
MV
2134#if SCM_ENABLE_DEPRECATED
2135
556d75db
MV
2136/* When these definitions are removed, it becomes reasonable to use
2137 read-only strings for string literals. For that, change the reader
2138 to create string literals with scm_c_substring_read_only instead of
2139 with scm_c_substring_copy.
2140*/
2141
3ee86942 2142int
fe78c51a 2143scm_i_deprecated_stringp (SCM str)
3ee86942
MV
2144{
2145 scm_c_issue_deprecation_warning
2146 ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
2147
2616f0e0 2148 return scm_is_string (str);
3ee86942
MV
2149}
2150
2151char *
fe78c51a 2152scm_i_deprecated_string_chars (SCM str)
3ee86942
MV
2153{
2154 char *chars;
2155
2156 scm_c_issue_deprecation_warning
2157 ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
2158
2616f0e0
MV
2159 /* We don't accept shared substrings here since they are not
2160 null-terminated.
2161 */
2162 if (IS_SH_STRING (str))
c291b588
LC
2163 scm_misc_error (NULL,
2164 "SCM_STRING_CHARS does not work with shared substrings",
2616f0e0
MV
2165 SCM_EOL);
2166
877f06c3 2167 /* We explicitly test for read-only strings to produce a better
556d75db
MV
2168 error message.
2169 */
2170
2171 if (IS_RO_STRING (str))
c291b588
LC
2172 scm_misc_error (NULL,
2173 "SCM_STRING_CHARS does not work with read-only strings",
556d75db 2174 SCM_EOL);
c291b588 2175
2616f0e0 2176 /* The following is still wrong, of course...
3ee86942 2177 */
9c44cd45 2178 str = scm_i_string_start_writing (str);
3ee86942
MV
2179 chars = scm_i_string_writable_chars (str);
2180 scm_i_string_stop_writing ();
2181 return chars;
2182}
2183
2184size_t
fe78c51a 2185scm_i_deprecated_string_length (SCM str)
3ee86942
MV
2186{
2187 scm_c_issue_deprecation_warning
2188 ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
2189 return scm_c_string_length (str);
2190}
2191
2192#endif
2193
2a610be5
AW
2194static SCM
2195string_handle_ref (scm_t_array_handle *h, size_t index)
2196{
2197 return scm_c_string_ref (h->array, index);
2198}
2199
2200static void
2201string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2202{
2203 scm_c_string_set_x (h->array, index, val);
2204}
2205
2206static void
2207string_get_handle (SCM v, scm_t_array_handle *h)
2208{
2209 h->array = v;
2210 h->ndims = 1;
2211 h->dims = &h->dim0;
2212 h->dim0.lbnd = 0;
2213 h->dim0.ubnd = scm_c_string_length (v) - 1;
2214 h->dim0.inc = 1;
2215 h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2216 h->elements = h->writable_elements = NULL;
2217}
2218
c5f17102 2219SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2a610be5 2220 string_handle_ref, string_handle_set,
f65e0168
LC
2221 string_get_handle)
2222SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2a610be5 2223
0f2d19dd
JB
2224void
2225scm_init_strings ()
0f2d19dd 2226{
a7e392c1 2227 scm_nullstr = scm_i_make_string (0, NULL, 0);
7c33806a 2228
a0599745 2229#include "libguile/strings.x"
0f2d19dd
JB
2230}
2231
89e00824
ML
2232
2233/*
2234 Local Variables:
2235 c-file-style: "gnu"
2236 End:
2237*/