reword open-file docs
[bpt/guile.git] / libguile / strings.c
CommitLineData
d40e1ca8 1/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
0f2d19dd 2 *
73be1d9e 3 * This library is free software; you can redistribute it and/or
53befeb7
NJ
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
0f2d19dd 7 *
53befeb7
NJ
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
0f2d19dd 12 *
73be1d9e
MV
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
53befeb7
NJ
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
73be1d9e 17 */
1bbd0b84 18
1bbd0b84 19
0f2d19dd 20\f
dbb605f5
LC
21#ifdef HAVE_CONFIG_H
22# include <config.h>
23#endif
0f2d19dd 24
cdd47ec7 25#include <alloca.h>
faf2c9d7 26#include <string.h>
3ee86942 27#include <stdio.h>
9c44cd45 28#include <ctype.h>
edb7bb47 29#include <uninorm.h>
9c44cd45 30#include <unistr.h>
eca29b02 31#include <uniconv.h>
faf2c9d7 32
889975e5
MG
33#include "striconveh.h"
34
a0599745
MD
35#include "libguile/_scm.h"
36#include "libguile/chars.h"
7c33806a 37#include "libguile/root.h"
a0599745 38#include "libguile/strings.h"
a3d7d5d5 39#include "libguile/error.h"
f45eccff 40#include "libguile/generalized-vectors.h"
1afff620 41#include "libguile/deprecation.h"
a0599745 42#include "libguile/validate.h"
d31b9519 43#include "libguile/private-options.h"
1afff620 44
0f2d19dd
JB
45\f
46
47/* {Strings}
48 */
49
3ee86942
MV
50
51/* Stringbufs
52 *
53 * XXX - keeping an accurate refcount during GC seems to be quite
54 * tricky, so we just keep score of whether a stringbuf might be
50b1996f 55 * shared, not whether it definitely is.
3ee86942
MV
56 *
57 * The scheme I (mvo) tried to keep an accurate reference count would
58 * recount all strings that point to a stringbuf during the mark-phase
59 * of the GC. This was done since one cannot access the stringbuf of
60 * a string when that string is freed (in order to decrease the
61 * reference count). The memory of the stringbuf might have been
62 * reused already for something completely different.
63 *
64 * This recounted worked for a small number of threads beating on
65 * cow-strings, but it failed randomly with more than 10 threads, say.
66 * I couldn't figure out what went wrong, so I used the conservative
67 * approach implemented below.
50b1996f 68 *
ba54a202
LC
69 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
70 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
71 * strings.
3ee86942
MV
72 */
73
ba54a202
LC
74/* The size in words of the stringbuf header (type tag + size). */
75#define STRINGBUF_HEADER_SIZE 2U
76
77#define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
78
35920c00 79#define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
5f236208 80#define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
3ee86942
MV
81
82#define STRINGBUF_TAG scm_tc7_stringbuf
83#define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
9c44cd45 84#define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
3ee86942 85
100e20c7 86#define STRINGBUF_CONTENTS(buf) ((void *) \
ba54a202
LC
87 SCM_CELL_OBJECT_LOC (buf, \
88 STRINGBUF_HEADER_SIZE))
100e20c7
LC
89#define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
90#define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
3ee86942 91
100e20c7 92#define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
3ee86942 93
9b41542f
LC
94#define SET_STRINGBUF_SHARED(buf) \
95 do \
96 { \
97 /* Don't modify BUF if it's already marked as shared since it might be \
98 a read-only, statically allocated stringbuf. */ \
99 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
100 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
101 } \
102 while (0)
3ee86942 103
56a3dcd4 104#ifdef SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
105static size_t lenhist[1001];
106#endif
107
50b1996f
MG
108/* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
109 characters. */
3ee86942
MV
110static SCM
111make_stringbuf (size_t len)
0f2d19dd 112{
3ee86942
MV
113 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
114 scm_i_symbol_chars, all stringbufs are null-terminated. Once
115 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
116 has been changed for scm_i_symbol_chars, this null-termination
117 can be dropped.
118 */
119
ba54a202
LC
120 SCM buf;
121
56a3dcd4 122#ifdef SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
123 if (len < 1000)
124 lenhist[len]++;
125 else
126 lenhist[1000]++;
127#endif
0f2d19dd 128
ba54a202
LC
129 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
130 "string"));
131
132 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
133 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
134
135 STRINGBUF_CHARS (buf)[len] = 0;
136
137 return buf;
3ee86942 138}
e53cc817 139
50b1996f
MG
140/* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
141 characters. */
9c44cd45
MG
142static SCM
143make_wide_stringbuf (size_t len)
144{
ba54a202
LC
145 SCM buf;
146 size_t raw_len;
147
56a3dcd4 148#ifdef SCM_STRING_LENGTH_HISTOGRAM
9c44cd45
MG
149 if (len < 1000)
150 lenhist[len]++;
151 else
152 lenhist[1000]++;
153#endif
154
ba54a202
LC
155 raw_len = (len + 1) * sizeof (scm_t_wchar);
156 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
157 "string"));
158
159 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
160 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
161
162 STRINGBUF_WIDE_CHARS (buf)[len] = 0;
163
164 return buf;
9c44cd45
MG
165}
166
ba54a202
LC
167/* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
168 characters from BUF. */
169static SCM
170wide_stringbuf (SCM buf)
9c44cd45 171{
ba54a202 172 SCM new_buf;
9c44cd45
MG
173
174 if (STRINGBUF_WIDE (buf))
ba54a202
LC
175 new_buf = buf;
176 else
9c44cd45 177 {
ba54a202
LC
178 size_t i, len;
179 scm_t_wchar *mem;
9c44cd45 180
ba54a202 181 len = STRINGBUF_LENGTH (buf);
9c44cd45 182
ba54a202 183 new_buf = make_wide_stringbuf (len);
9c44cd45 184
ba54a202 185 mem = STRINGBUF_WIDE_CHARS (new_buf);
9c44cd45 186 for (i = 0; i < len; i++)
ba54a202 187 mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
9c44cd45 188 mem[len] = 0;
9c44cd45 189 }
ba54a202
LC
190
191 return new_buf;
3ee86942 192}
bd9e24b3 193
ba54a202
LC
194/* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
195 characters from BUF, if possible. */
196static SCM
587a3355
MG
197narrow_stringbuf (SCM buf)
198{
ba54a202 199 SCM new_buf;
587a3355
MG
200
201 if (!STRINGBUF_WIDE (buf))
ba54a202
LC
202 new_buf = buf;
203 else
204 {
205 size_t i, len;
206 scm_t_wchar *wmem;
207 unsigned char *mem;
587a3355 208
ba54a202
LC
209 len = STRINGBUF_LENGTH (buf);
210 wmem = STRINGBUF_WIDE_CHARS (buf);
587a3355 211
ba54a202
LC
212 for (i = 0; i < len; i++)
213 if (wmem[i] > 0xFF)
214 /* BUF cannot be narrowed. */
215 return buf;
587a3355 216
ba54a202 217 new_buf = make_stringbuf (len);
587a3355 218
ba54a202
LC
219 mem = STRINGBUF_CHARS (new_buf);
220 for (i = 0; i < len; i++)
221 mem[i] = (unsigned char) wmem[i];
222 mem[len] = 0;
223 }
224
225 return new_buf;
587a3355
MG
226}
227
9de87eea 228scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
bd9e24b3 229
ba54a202 230\f
3ee86942
MV
231/* Copy-on-write strings.
232 */
bd9e24b3 233
3ee86942 234#define STRING_TAG scm_tc7_string
bd9e24b3 235
3ee86942
MV
236#define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
237#define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
238#define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
bd9e24b3 239
3ee86942
MV
240#define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
241#define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
242
243#define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
244
ed35de72
MV
245/* Read-only strings.
246 */
247
35920c00 248#define RO_STRING_TAG scm_tc7_ro_string
ed35de72
MV
249#define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
250
e1b29f6a
MV
251/* Mutation-sharing substrings
252 */
253
254#define SH_STRING_TAG (scm_tc7_string + 0x100)
255
256#define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
257/* START and LENGTH as for STRINGs. */
258
259#define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
260
e7efe8e7
AW
261SCM scm_nullstr;
262
50b1996f
MG
263/* Create a scheme string with space for LEN 8-bit Latin-1-encoded
264 characters. CHARSP, if not NULL, will be set to location of the
190d4b0d
LC
265 char array. If READ_ONLY_P, the returned string is read-only;
266 otherwise it is writable. */
3ee86942 267SCM
190d4b0d 268scm_i_make_string (size_t len, char **charsp, int read_only_p)
3ee86942
MV
269{
270 SCM buf = make_stringbuf (len);
271 SCM res;
272 if (charsp)
f59cf998 273 *charsp = (char *) STRINGBUF_CHARS (buf);
190d4b0d
LC
274 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
275 SCM_UNPACK (buf),
276 (scm_t_bits) 0, (scm_t_bits) len);
3ee86942 277 return res;
0f2d19dd
JB
278}
279
50b1996f
MG
280/* Create a scheme string with space for LEN 32-bit UCS-4-encoded
281 characters. CHARSP, if not NULL, will be set to location of the
190d4b0d
LC
282 character array. If READ_ONLY_P, the returned string is read-only;
283 otherwise it is writable. */
9c44cd45 284SCM
190d4b0d 285scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
9c44cd45
MG
286{
287 SCM buf = make_wide_stringbuf (len);
288 SCM res;
289 if (charsp)
290 *charsp = STRINGBUF_WIDE_CHARS (buf);
190d4b0d
LC
291 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
292 SCM_UNPACK (buf),
9c44cd45
MG
293 (scm_t_bits) 0, (scm_t_bits) len);
294 return res;
295}
296
3ee86942
MV
297static void
298validate_substring_args (SCM str, size_t start, size_t end)
299{
300 if (!IS_STRING (str))
301 scm_wrong_type_arg_msg (NULL, 0, str, "string");
302 if (start > STRING_LENGTH (str))
303 scm_out_of_range (NULL, scm_from_size_t (start));
304 if (end > STRING_LENGTH (str) || end < start)
305 scm_out_of_range (NULL, scm_from_size_t (end));
306}
0f2d19dd 307
e1b29f6a
MV
308static inline void
309get_str_buf_start (SCM *str, SCM *buf, size_t *start)
310{
311 *start = STRING_START (*str);
312 if (IS_SH_STRING (*str))
313 {
314 *str = SH_STRING_STRING (*str);
315 *start += STRING_START (*str);
316 }
317 *buf = STRING_STRINGBUF (*str);
318}
319
3ee86942
MV
320SCM
321scm_i_substring (SCM str, size_t start, size_t end)
0f2d19dd 322{
e1b29f6a
MV
323 SCM buf;
324 size_t str_start;
325 get_str_buf_start (&str, &buf, &str_start);
9de87eea 326 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 327 SET_STRINGBUF_SHARED (buf);
9de87eea 328 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 329 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
e1b29f6a
MV
330 (scm_t_bits)str_start + start,
331 (scm_t_bits) end - start);
0f2d19dd
JB
332}
333
ed35de72
MV
334SCM
335scm_i_substring_read_only (SCM str, size_t start, size_t end)
336{
45a9f430
LC
337 SCM buf;
338 size_t str_start;
339 get_str_buf_start (&str, &buf, &str_start);
340 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
341 SET_STRINGBUF_SHARED (buf);
342 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
343 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
344 (scm_t_bits)str_start + start,
345 (scm_t_bits) end - start);
ed35de72
MV
346}
347
3ee86942
MV
348SCM
349scm_i_substring_copy (SCM str, size_t start, size_t end)
350{
351 size_t len = end - start;
edea856c 352 SCM buf, my_buf;
e1b29f6a
MV
353 size_t str_start;
354 get_str_buf_start (&str, &buf, &str_start);
9c44cd45
MG
355 if (scm_i_is_narrow_string (str))
356 {
357 my_buf = make_stringbuf (len);
358 memcpy (STRINGBUF_CHARS (my_buf),
359 STRINGBUF_CHARS (buf) + str_start + start, len);
360 }
361 else
362 {
363 my_buf = make_wide_stringbuf (len);
364 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
365 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
366 + start), len);
367 /* Even though this string is wide, the substring may be narrow.
50b1996f 368 Consider adding code to narrow the string. */
9c44cd45 369 }
3ee86942 370 scm_remember_upto_here_1 (buf);
9c44cd45
MG
371 return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
372 (scm_t_bits) 0, (scm_t_bits) len);
3ee86942 373}
0f2d19dd 374
e1b29f6a
MV
375SCM
376scm_i_substring_shared (SCM str, size_t start, size_t end)
377{
378 if (start == 0 && end == STRING_LENGTH (str))
379 return str;
380 else
381 {
382 size_t len = end - start;
383 if (IS_SH_STRING (str))
384 {
385 start += STRING_START (str);
386 str = SH_STRING_STRING (str);
387 }
388 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
389 (scm_t_bits)start, (scm_t_bits) len);
390 }
391}
392
3ee86942
MV
393SCM
394scm_c_substring (SCM str, size_t start, size_t end)
395{
396 validate_substring_args (str, start, end);
397 return scm_i_substring (str, start, end);
398}
ee149d03 399
ed35de72
MV
400SCM
401scm_c_substring_read_only (SCM str, size_t start, size_t end)
402{
403 validate_substring_args (str, start, end);
404 return scm_i_substring_read_only (str, start, end);
405}
406
0f2d19dd 407SCM
3ee86942 408scm_c_substring_copy (SCM str, size_t start, size_t end)
0f2d19dd 409{
3ee86942
MV
410 validate_substring_args (str, start, end);
411 return scm_i_substring_copy (str, start, end);
412}
413
3ee86942
MV
414SCM
415scm_c_substring_shared (SCM str, size_t start, size_t end)
416{
417 validate_substring_args (str, start, end);
418 return scm_i_substring_shared (str, start, end);
419}
0f2d19dd 420
d6c74168 421\f
3ee86942
MV
422/* Internal accessors
423 */
424
50b1996f
MG
425/* Returns the number of characters in STR. This may be different
426 than the memory size of the string storage. */
3ee86942
MV
427size_t
428scm_i_string_length (SCM str)
0f2d19dd 429{
3ee86942 430 return STRING_LENGTH (str);
0f2d19dd
JB
431}
432
50b1996f
MG
433/* True if the string is 'narrow', meaning it has a 8-bit Latin-1
434 encoding. False if it is 'wide', having a 32-bit UCS-4
435 encoding. */
9c44cd45
MG
436int
437scm_i_is_narrow_string (SCM str)
438{
439 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
440}
441
587a3355
MG
442/* Try to coerce a string to be narrow. It if is narrow already, do
443 nothing. If it is wide, shrink it to narrow if none of its
444 characters are above 0xFF. Return true if the string is narrow or
445 was made to be narrow. */
446int
447scm_i_try_narrow_string (SCM str)
448{
ba54a202 449 SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
587a3355
MG
450
451 return scm_i_is_narrow_string (str);
452}
453
100e20c7
LC
454/* Return a pointer to the raw data of the string, which can be either Latin-1
455 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
456const void *
457scm_i_string_data (SCM str)
458{
459 SCM buf;
460 size_t start;
461 const char *data;
462
463 get_str_buf_start (&str, &buf, &start);
464
465 data = STRINGBUF_CONTENTS (buf);
466 data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
467
468 return data;
469}
470
50b1996f
MG
471/* Returns a pointer to the 8-bit Latin-1 encoded character array of
472 STR. */
3ee86942
MV
473const char *
474scm_i_string_chars (SCM str)
475{
476 SCM buf;
e1b29f6a
MV
477 size_t start;
478 get_str_buf_start (&str, &buf, &start);
9c44cd45 479 if (scm_i_is_narrow_string (str))
f59cf998 480 return (const char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
481 else
482 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
483 scm_list_1 (str));
484 return NULL;
3ee86942 485}
b00418df 486
50b1996f
MG
487/* Returns a pointer to the 32-bit UCS-4 encoded character array of
488 STR. */
9c44cd45
MG
489const scm_t_wchar *
490scm_i_string_wide_chars (SCM str)
491{
492 SCM buf;
493 size_t start;
494
495 get_str_buf_start (&str, &buf, &start);
496 if (!scm_i_is_narrow_string (str))
f59cf998 497 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
9c44cd45
MG
498 else
499 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
500 scm_list_1 (str));
501}
502
503/* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
504 a new string buffer, so that it can be modified without modifying
50b1996f
MG
505 other strings. Also, lock the string mutex. Later, one must call
506 scm_i_string_stop_writing to unlock the mutex. */
9c44cd45
MG
507SCM
508scm_i_string_start_writing (SCM orig_str)
b00418df 509{
ed35de72 510 SCM buf, str = orig_str;
e1b29f6a 511 size_t start;
ed35de72 512
e1b29f6a 513 get_str_buf_start (&str, &buf, &start);
ed35de72
MV
514 if (IS_RO_STRING (str))
515 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
516
9de87eea 517 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942
MV
518 if (STRINGBUF_SHARED (buf))
519 {
9c44cd45 520 /* Clone the stringbuf. */
3ee86942
MV
521 size_t len = STRING_LENGTH (str);
522 SCM new_buf;
523
9de87eea 524 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 525
9c44cd45
MG
526 if (scm_i_is_narrow_string (str))
527 {
528 new_buf = make_stringbuf (len);
529 memcpy (STRINGBUF_CHARS (new_buf),
530 STRINGBUF_CHARS (buf) + STRING_START (str), len);
531
532 }
533 else
534 {
535 new_buf = make_wide_stringbuf (len);
536 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
537 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
538 + STRING_START (str)), len);
539 }
3ee86942 540
3ee86942
MV
541 SET_STRING_STRINGBUF (str, new_buf);
542 start -= STRING_START (str);
902578f1
LC
543
544 /* FIXME: The following operations are not atomic, so other threads
545 looking at STR may see an inconsistent state. Nevertheless it can't
546 hurt much since (i) accessing STR while it is being mutated can't
547 yield a crash, and (ii) concurrent accesses to STR should be
548 protected by a mutex at the application level. The latter may not
549 apply when STR != ORIG_STR, though. */
3ee86942 550 SET_STRING_START (str, 0);
902578f1 551 SET_STRING_STRINGBUF (str, new_buf);
3ee86942
MV
552
553 buf = new_buf;
554
9de87eea 555 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 556 }
9c44cd45
MG
557 return orig_str;
558}
3ee86942 559
50b1996f 560/* Return a pointer to the 8-bit Latin-1 chars of a string. */
9c44cd45
MG
561char *
562scm_i_string_writable_chars (SCM str)
563{
564 SCM buf;
565 size_t start;
3ee86942 566
9c44cd45
MG
567 get_str_buf_start (&str, &buf, &start);
568 if (scm_i_is_narrow_string (str))
f59cf998 569 return (char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
570 else
571 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
572 scm_list_1 (str));
573 return NULL;
b00418df
DH
574}
575
50b1996f 576/* Return a pointer to the UCS-4 codepoints of a string. */
9c44cd45
MG
577static scm_t_wchar *
578scm_i_string_writable_wide_chars (SCM str)
579{
580 SCM buf;
581 size_t start;
582
583 get_str_buf_start (&str, &buf, &start);
584 if (!scm_i_is_narrow_string (str))
585 return STRINGBUF_WIDE_CHARS (buf) + start;
586 else
1c7b216f 587 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
9c44cd45 588 scm_list_1 (str));
b00418df
DH
589}
590
50b1996f
MG
591/* Unlock the string mutex that was locked when
592 scm_i_string_start_writing was called. */
3ee86942
MV
593void
594scm_i_string_stop_writing (void)
595{
9de87eea 596 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 597}
b00418df 598
50b1996f 599/* Return the Xth character of STR as a UCS-4 codepoint. */
9c44cd45
MG
600scm_t_wchar
601scm_i_string_ref (SCM str, size_t x)
602{
603 if (scm_i_is_narrow_string (str))
604 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
605 else
606 return scm_i_string_wide_chars (str)[x];
607}
608
889975e5
MG
609/* Returns index+1 of the first char in STR that matches C, or
610 0 if the char is not found. */
611int
612scm_i_string_contains_char (SCM str, char ch)
613{
614 size_t i;
615 size_t len = scm_i_string_length (str);
616
617 i = 0;
618 if (scm_i_is_narrow_string (str))
619 {
620 while (i < len)
621 {
622 if (scm_i_string_chars (str)[i] == ch)
623 return i+1;
624 i++;
625 }
626 }
627 else
628 {
629 while (i < len)
630 {
631 if (scm_i_string_wide_chars (str)[i]
632 == (unsigned char) ch)
633 return i+1;
634 i++;
635 }
636 }
637 return 0;
638}
639
3f47e526
MG
640int
641scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
642{
643 if (scm_i_is_narrow_string (sstr))
644 {
645 const char *a = scm_i_string_chars (sstr) + start_x;
646 const char *b = cstr;
647 return strncmp (a, b, strlen(b));
648 }
649 else
650 {
651 size_t i;
652 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
653 const char *b = cstr;
654 for (i = 0; i < strlen (b); i++)
655 {
656 if (a[i] != (unsigned char) b[i])
657 return 1;
658 }
659 }
660 return 0;
661}
662
50b1996f 663/* Set the Pth character of STR to UCS-4 codepoint CHR. */
9c44cd45
MG
664void
665scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
666{
667 if (chr > 0xFF && scm_i_is_narrow_string (str))
ba54a202 668 SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
9c44cd45
MG
669
670 if (scm_i_is_narrow_string (str))
671 {
672 char *dst = scm_i_string_writable_chars (str);
587a3355 673 dst[p] = chr;
9c44cd45
MG
674 }
675 else
676 {
677 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
678 dst[p] = chr;
679 }
680}
681
ba54a202 682\f
3ee86942 683/* Symbols.
587a3355 684
3ee86942
MV
685 Basic symbol creation and accessing is done here, the rest is in
686 symbols.[hc]. This has been done to keep stringbufs and the
687 internals of strings and string-like objects confined to this file.
688*/
689
690#define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
691
692SCM
6869328b
MV
693scm_i_make_symbol (SCM name, scm_t_bits flags,
694 unsigned long hash, SCM props)
3ee86942
MV
695{
696 SCM buf;
697 size_t start = STRING_START (name);
698 size_t length = STRING_LENGTH (name);
699
700 if (IS_SH_STRING (name))
701 {
702 name = SH_STRING_STRING (name);
703 start += STRING_START (name);
704 }
705 buf = SYMBOL_STRINGBUF (name);
706
707 if (start == 0 && length == STRINGBUF_LENGTH (buf))
708 {
709 /* reuse buf. */
9de87eea 710 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 711 SET_STRINGBUF_SHARED (buf);
9de87eea 712 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942
MV
713 }
714 else
715 {
716 /* make new buf. */
9c44cd45
MG
717 if (scm_i_is_narrow_string (name))
718 {
719 SCM new_buf = make_stringbuf (length);
720 memcpy (STRINGBUF_CHARS (new_buf),
721 STRINGBUF_CHARS (buf) + start, length);
722 buf = new_buf;
723 }
724 else
725 {
726 SCM new_buf = make_wide_stringbuf (length);
727 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
728 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
729 length);
730 buf = new_buf;
731 }
3ee86942 732 }
6869328b 733 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
3ee86942
MV
734 (scm_t_bits) hash, SCM_UNPACK (props));
735}
736
fd0a5bbc
HWN
737SCM
738scm_i_c_make_symbol (const char *name, size_t len,
739 scm_t_bits flags, unsigned long hash, SCM props)
740{
741 SCM buf = make_stringbuf (len);
742 memcpy (STRINGBUF_CHARS (buf), name, len);
743
65619ebe
AW
744 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
745 (scm_t_bits) hash, SCM_UNPACK (props));
fd0a5bbc
HWN
746}
747
50b1996f
MG
748/* Returns the number of characters in SYM. This may be different
749 from the memory size of SYM. */
3ee86942
MV
750size_t
751scm_i_symbol_length (SCM sym)
0f2d19dd 752{
3ee86942 753 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
0f2d19dd
JB
754}
755
071bb6a8
LC
756size_t
757scm_c_symbol_length (SCM sym)
758#define FUNC_NAME "scm_c_symbol_length"
759{
760 SCM_VALIDATE_SYMBOL (1, sym);
761
762 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
763}
764#undef FUNC_NAME
765
50b1996f
MG
766/* True if the name of SYM is stored as a Latin-1 encoded string.
767 False if it is stored as a 32-bit UCS-4-encoded string. */
9c44cd45
MG
768int
769scm_i_is_narrow_symbol (SCM sym)
770{
771 SCM buf;
772
773 buf = SYMBOL_STRINGBUF (sym);
774 return !STRINGBUF_WIDE (buf);
775}
776
50b1996f
MG
777/* Returns a pointer to the 8-bit Latin-1 encoded character array that
778 contains the name of SYM. */
3ee86942
MV
779const char *
780scm_i_symbol_chars (SCM sym)
781{
9c44cd45
MG
782 SCM buf;
783
784 buf = SYMBOL_STRINGBUF (sym);
785 if (!STRINGBUF_WIDE (buf))
f59cf998 786 return (const char *) STRINGBUF_CHARS (buf);
9c44cd45
MG
787 else
788 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
789 scm_list_1 (sym));
790}
791
50b1996f
MG
792/* Return a pointer to the 32-bit UCS-4-encoded character array of a
793 symbol's name. */
9c44cd45
MG
794const scm_t_wchar *
795scm_i_symbol_wide_chars (SCM sym)
796{
797 SCM buf;
798
799 buf = SYMBOL_STRINGBUF (sym);
800 if (STRINGBUF_WIDE (buf))
f59cf998 801 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
9c44cd45
MG
802 else
803 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
804 scm_list_1 (sym));
3ee86942 805}
1cc91f1b 806
be54b15d 807SCM
3ee86942 808scm_i_symbol_substring (SCM sym, size_t start, size_t end)
be54b15d 809{
3ee86942 810 SCM buf = SYMBOL_STRINGBUF (sym);
9de87eea 811 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 812 SET_STRINGBUF_SHARED (buf);
9de87eea 813 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
fd2b17b9 814 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
3ee86942
MV
815 (scm_t_bits)start, (scm_t_bits) end - start);
816}
be54b15d 817
50b1996f 818/* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
9c44cd45
MG
819scm_t_wchar
820scm_i_symbol_ref (SCM sym, size_t x)
821{
822 if (scm_i_is_narrow_symbol (sym))
823 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
824 else
825 return scm_i_symbol_wide_chars (sym)[x];
826}
827
3ee86942
MV
828/* Debugging
829 */
be54b15d 830
6ce6923b
MG
831SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
832 "Returns an association list containing debugging information\n"
833 "for @var{str}. The association list has the following entries."
834 "@table @code\n"
835 "@item string\n"
836 "The string itself.\n"
837 "@item start\n"
838 "The start index of the string into its stringbuf\n"
839 "@item length\n"
840 "The length of the string\n"
841 "@item shared\n"
842 "If this string is a substring, it returns its parent string.\n"
843 "Otherwise, it returns @code{#f}\n"
88ed5759
MG
844 "@item read-only\n"
845 "@code{#t} if the string is read-only\n"
6ce6923b
MG
846 "@item stringbuf-chars\n"
847 "A new string containing this string's stringbuf's characters\n"
848 "@item stringbuf-length\n"
849 "The number of characters in this stringbuf\n"
850 "@item stringbuf-shared\n"
851 "@code{#t} if this stringbuf is shared\n"
6ce6923b
MG
852 "@item stringbuf-wide\n"
853 "@code{#t} if this stringbuf's characters are stored in a\n"
854 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
855 "buffer\n"
856 "@end table")
3ee86942
MV
857#define FUNC_NAME s_scm_sys_string_dump
858{
ba54a202 859 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
6ce6923b 860 SCM buf;
3ee86942 861 SCM_VALIDATE_STRING (1, str);
6ce6923b
MG
862
863 /* String info */
4a655e50 864 e1 = scm_cons (scm_from_latin1_symbol ("string"),
6ce6923b 865 str);
4a655e50 866 e2 = scm_cons (scm_from_latin1_symbol ("start"),
6ce6923b 867 scm_from_size_t (STRING_START (str)));
4a655e50 868 e3 = scm_cons (scm_from_latin1_symbol ("length"),
6ce6923b
MG
869 scm_from_size_t (STRING_LENGTH (str)));
870
3ee86942
MV
871 if (IS_SH_STRING (str))
872 {
4a655e50 873 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
6ce6923b
MG
874 SH_STRING_STRING (str));
875 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
3ee86942
MV
876 }
877 else
878 {
4a655e50 879 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
6ce6923b
MG
880 SCM_BOOL_F);
881 buf = STRING_STRINGBUF (str);
3ee86942 882 }
9c44cd45 883
88ed5759 884 if (IS_RO_STRING (str))
4a655e50 885 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
88ed5759
MG
886 SCM_BOOL_T);
887 else
4a655e50 888 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
88ed5759 889 SCM_BOOL_F);
587a3355 890
6ce6923b 891 /* Stringbuf info */
6ce6923b
MG
892 if (!STRINGBUF_WIDE (buf))
893 {
894 size_t len = STRINGBUF_LENGTH (buf);
895 char *cbuf;
190d4b0d 896 SCM sbc = scm_i_make_string (len, &cbuf, 0);
6ce6923b 897 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
4a655e50 898 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b 899 sbc);
3ee86942 900 }
6ce6923b
MG
901 else
902 {
903 size_t len = STRINGBUF_LENGTH (buf);
904 scm_t_wchar *cbuf;
190d4b0d 905 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
6ce6923b
MG
906 u32_cpy ((scm_t_uint32 *) cbuf,
907 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
4a655e50 908 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b
MG
909 sbc);
910 }
4a655e50 911 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
6ce6923b
MG
912 scm_from_size_t (STRINGBUF_LENGTH (buf)));
913 if (STRINGBUF_SHARED (buf))
4a655e50 914 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b
MG
915 SCM_BOOL_T);
916 else
4a655e50 917 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b 918 SCM_BOOL_F);
6ce6923b 919 if (STRINGBUF_WIDE (buf))
4a655e50 920 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
ba54a202 921 SCM_BOOL_T);
6ce6923b 922 else
4a655e50 923 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
ba54a202 924 SCM_BOOL_F);
6ce6923b 925
ba54a202 926 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
3ee86942
MV
927}
928#undef FUNC_NAME
929
6ce6923b
MG
930SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
931 "Returns an association list containing debugging information\n"
932 "for @var{sym}. The association list has the following entries."
933 "@table @code\n"
934 "@item symbol\n"
935 "The symbol itself\n"
936 "@item hash\n"
937 "Its hash value\n"
88ed5759
MG
938 "@item interned\n"
939 "@code{#t} if it is an interned symbol\n"
6ce6923b
MG
940 "@item stringbuf-chars\n"
941 "A new string containing this symbols's stringbuf's characters\n"
942 "@item stringbuf-length\n"
943 "The number of characters in this stringbuf\n"
944 "@item stringbuf-shared\n"
945 "@code{#t} if this stringbuf is shared\n"
6ce6923b
MG
946 "@item stringbuf-wide\n"
947 "@code{#t} if this stringbuf's characters are stored in a\n"
948 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
949 "buffer\n"
950 "@end table")
3ee86942
MV
951#define FUNC_NAME s_scm_sys_symbol_dump
952{
ba54a202 953 SCM e1, e2, e3, e4, e5, e6, e7;
6ce6923b 954 SCM buf;
3ee86942 955 SCM_VALIDATE_SYMBOL (1, sym);
4a655e50 956 e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
6ce6923b 957 sym);
4a655e50 958 e2 = scm_cons (scm_from_latin1_symbol ("hash"),
6ce6923b 959 scm_from_ulong (scm_i_symbol_hash (sym)));
4a655e50 960 e3 = scm_cons (scm_from_latin1_symbol ("interned"),
88ed5759 961 scm_symbol_interned_p (sym));
6ce6923b
MG
962 buf = SYMBOL_STRINGBUF (sym);
963
964 /* Stringbuf info */
6ce6923b
MG
965 if (!STRINGBUF_WIDE (buf))
966 {
967 size_t len = STRINGBUF_LENGTH (buf);
968 char *cbuf;
190d4b0d 969 SCM sbc = scm_i_make_string (len, &cbuf, 0);
6ce6923b 970 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
4a655e50 971 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b
MG
972 sbc);
973 }
9c44cd45 974 else
6ce6923b
MG
975 {
976 size_t len = STRINGBUF_LENGTH (buf);
977 scm_t_wchar *cbuf;
190d4b0d 978 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
6ce6923b
MG
979 u32_cpy ((scm_t_uint32 *) cbuf,
980 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
4a655e50 981 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b
MG
982 sbc);
983 }
4a655e50 984 e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
6ce6923b
MG
985 scm_from_size_t (STRINGBUF_LENGTH (buf)));
986 if (STRINGBUF_SHARED (buf))
4a655e50 987 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b
MG
988 SCM_BOOL_T);
989 else
4a655e50 990 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b 991 SCM_BOOL_F);
6ce6923b 992 if (STRINGBUF_WIDE (buf))
4a655e50 993 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
6ce6923b
MG
994 SCM_BOOL_T);
995 else
4a655e50 996 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
6ce6923b 997 SCM_BOOL_F);
ba54a202 998 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
6ce6923b 999
3ee86942
MV
1000}
1001#undef FUNC_NAME
1002
56a3dcd4 1003#ifdef SCM_STRING_LENGTH_HISTOGRAM
6ce6923b 1004
9c44cd45 1005SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
e1b29f6a 1006#define FUNC_NAME s_scm_sys_stringbuf_hist
3ee86942
MV
1007{
1008 int i;
1009 for (i = 0; i < 1000; i++)
1010 if (lenhist[i])
1011 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1012 fprintf (stderr, ">999: %u\n", lenhist[1000]);
1013 return SCM_UNSPECIFIED;
be54b15d
DH
1014}
1015#undef FUNC_NAME
1016
3ee86942
MV
1017#endif
1018
1019\f
1020
1021SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1022 (SCM obj),
1023 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1024#define FUNC_NAME s_scm_string_p
1025{
1026 return scm_from_bool (IS_STRING (obj));
1027}
1028#undef FUNC_NAME
1029
1030
1031SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1032
1033SCM_DEFINE (scm_string, "string", 0, 0, 1,
1034 (SCM chrs),
1035 "@deffnx {Scheme Procedure} list->string chrs\n"
1036 "Return a newly allocated string composed of the arguments,\n"
1037 "@var{chrs}.")
1038#define FUNC_NAME s_scm_string
1039{
9aa27c1a 1040 SCM result = SCM_BOOL_F;
9c44cd45 1041 SCM rest;
3ee86942 1042 size_t len;
9c44cd45
MG
1043 size_t p = 0;
1044 long i;
9aa27c1a 1045 int wide = 0;
3ee86942 1046
9c44cd45
MG
1047 /* Verify that this is a list of chars. */
1048 i = scm_ilength (chrs);
3c7cf7f5 1049 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
3ee86942 1050
9c44cd45
MG
1051 len = (size_t) i;
1052 rest = chrs;
3ee86942 1053
9c44cd45 1054 while (len > 0 && scm_is_pair (rest))
3ee86942 1055 {
9c44cd45 1056 SCM elt = SCM_CAR (rest);
3ee86942 1057 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
9aa27c1a
MG
1058 if (SCM_CHAR (elt) > 0xFF)
1059 wide = 1;
9c44cd45
MG
1060 rest = SCM_CDR (rest);
1061 len--;
1062 scm_remember_upto_here_1 (elt);
1063 }
1064
1065 /* Construct a string containing this list of chars. */
1066 len = (size_t) i;
1067 rest = chrs;
1068
9aa27c1a 1069 if (wide == 0)
9c44cd45 1070 {
56a3dcd4
LC
1071 char *buf;
1072
190d4b0d 1073 result = scm_i_make_string (len, NULL, 0);
9aa27c1a 1074 result = scm_i_string_start_writing (result);
56a3dcd4 1075 buf = scm_i_string_writable_chars (result);
9aa27c1a
MG
1076 while (len > 0 && scm_is_pair (rest))
1077 {
1078 SCM elt = SCM_CAR (rest);
1079 buf[p] = (unsigned char) SCM_CHAR (elt);
1080 p++;
1081 rest = SCM_CDR (rest);
1082 len--;
1083 scm_remember_upto_here_1 (elt);
1084 }
1085 }
1086 else
1087 {
56a3dcd4
LC
1088 scm_t_wchar *buf;
1089
190d4b0d 1090 result = scm_i_make_wide_string (len, NULL, 0);
9aa27c1a 1091 result = scm_i_string_start_writing (result);
56a3dcd4 1092 buf = scm_i_string_writable_wide_chars (result);
9aa27c1a
MG
1093 while (len > 0 && scm_is_pair (rest))
1094 {
1095 SCM elt = SCM_CAR (rest);
1096 buf[p] = SCM_CHAR (elt);
1097 p++;
1098 rest = SCM_CDR (rest);
1099 len--;
1100 scm_remember_upto_here_1 (elt);
1101 }
3ee86942 1102 }
9c44cd45
MG
1103 scm_i_string_stop_writing ();
1104
3ee86942
MV
1105 if (len > 0)
1106 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
9c44cd45 1107 if (!scm_is_null (rest))
3ee86942
MV
1108 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1109
1110 return result;
1111}
1112#undef FUNC_NAME
be54b15d 1113
3b3b36dd 1114SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
6fa73e72 1115 (SCM k, SCM chr),
0d26a824
MG
1116 "Return a newly allocated string of\n"
1117 "length @var{k}. If @var{chr} is given, then all elements of\n"
1118 "the string are initialized to @var{chr}, otherwise the contents\n"
3ef6650d 1119 "of the @var{string} are all set to @var{#\nul}.")
1bbd0b84 1120#define FUNC_NAME s_scm_make_string
0f2d19dd 1121{
3ee86942
MV
1122 return scm_c_make_string (scm_to_size_t (k), chr);
1123}
1124#undef FUNC_NAME
1125
1126SCM
1127scm_c_make_string (size_t len, SCM chr)
1128#define FUNC_NAME NULL
1129{
9c44cd45 1130 size_t p;
3ef6650d 1131 char *contents = NULL;
190d4b0d 1132 SCM res = scm_i_make_string (len, &contents, 0);
cb0d8be2 1133
3ef6650d
AW
1134 /* If no char is given, initialize string contents to NULL. */
1135 if (SCM_UNBNDP (chr))
1136 memset (contents, 0, len);
1137 else
e11e83f3 1138 {
3ee86942 1139 SCM_VALIDATE_CHAR (0, chr);
9c44cd45
MG
1140 res = scm_i_string_start_writing (res);
1141 for (p = 0; p < len; p++)
1142 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1143 scm_i_string_stop_writing ();
0f2d19dd 1144 }
e11e83f3
MV
1145
1146 return res;
0f2d19dd 1147}
1bbd0b84 1148#undef FUNC_NAME
0f2d19dd 1149
3b3b36dd 1150SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
0d26a824
MG
1151 (SCM string),
1152 "Return the number of characters in @var{string}.")
1bbd0b84 1153#define FUNC_NAME s_scm_string_length
0f2d19dd 1154{
d1ca2c64 1155 SCM_VALIDATE_STRING (1, string);
3ee86942 1156 return scm_from_size_t (STRING_LENGTH (string));
0f2d19dd 1157}
1bbd0b84 1158#undef FUNC_NAME
0f2d19dd 1159
f8ba2bb9 1160SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
9c44cd45
MG
1161 (SCM string),
1162 "Return the bytes used to represent a character in @var{string}."
1163 "This will return 1 or 4.")
f8ba2bb9 1164#define FUNC_NAME s_scm_string_bytes_per_char
9c44cd45
MG
1165{
1166 SCM_VALIDATE_STRING (1, string);
1167 if (!scm_i_is_narrow_string (string))
1168 return scm_from_int (4);
1169
1170 return scm_from_int (1);
1171}
1172#undef FUNC_NAME
1173
3ee86942
MV
1174size_t
1175scm_c_string_length (SCM string)
1176{
1177 if (!IS_STRING (string))
1178 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1179 return STRING_LENGTH (string);
1180}
1181
bd9e24b3 1182SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
6fa73e72 1183 (SCM str, SCM k),
9c44cd45
MG
1184 "Return character @var{k} of @var{str} using zero-origin\n"
1185 "indexing. @var{k} must be a valid index of @var{str}.")
1bbd0b84 1186#define FUNC_NAME s_scm_string_ref
0f2d19dd 1187{
3ae3166b 1188 size_t len;
a55c2b68 1189 unsigned long idx;
bd9e24b3 1190
d1ca2c64 1191 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1192
1193 len = scm_i_string_length (str);
1194 if (SCM_LIKELY (len > 0))
1195 idx = scm_to_unsigned_integer (k, 0, len - 1);
1196 else
1197 scm_out_of_range (NULL, k);
1198
9c44cd45
MG
1199 if (scm_i_is_narrow_string (str))
1200 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1201 else
1202 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
0f2d19dd 1203}
1bbd0b84 1204#undef FUNC_NAME
0f2d19dd 1205
3ee86942
MV
1206SCM
1207scm_c_string_ref (SCM str, size_t p)
1208{
1209 if (p >= scm_i_string_length (str))
1210 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1211 if (scm_i_is_narrow_string (str))
1212 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1213 else
1214 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1215
3ee86942 1216}
f0942910 1217
3b3b36dd 1218SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
6fa73e72 1219 (SCM str, SCM k, SCM chr),
9c44cd45
MG
1220 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1221 "an unspecified value. @var{k} must be a valid index of\n"
1222 "@var{str}.")
1bbd0b84 1223#define FUNC_NAME s_scm_string_set_x
0f2d19dd 1224{
3ae3166b 1225 size_t len;
a55c2b68
MV
1226 unsigned long idx;
1227
f0942910 1228 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1229
1230 len = scm_i_string_length (str);
1231 if (SCM_LIKELY (len > 0))
1232 idx = scm_to_unsigned_integer (k, 0, len - 1);
1233 else
1234 scm_out_of_range (NULL, k);
1235
34d19ef6 1236 SCM_VALIDATE_CHAR (3, chr);
9c44cd45
MG
1237 str = scm_i_string_start_writing (str);
1238 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1239 scm_i_string_stop_writing ();
1240
0f2d19dd
JB
1241 return SCM_UNSPECIFIED;
1242}
1bbd0b84 1243#undef FUNC_NAME
0f2d19dd 1244
3ee86942
MV
1245void
1246scm_c_string_set_x (SCM str, size_t p, SCM chr)
1247{
1248 if (p >= scm_i_string_length (str))
1249 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1250 str = scm_i_string_start_writing (str);
1251 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1252 scm_i_string_stop_writing ();
3ee86942 1253}
0f2d19dd 1254
3b3b36dd 1255SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
0d26a824
MG
1256 (SCM str, SCM start, SCM end),
1257 "Return a newly allocated string formed from the characters\n"
1258 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1259 "ending with index @var{end} (exclusive).\n"
1260 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1261 "exact integers satisfying:\n\n"
1262 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1bbd0b84 1263#define FUNC_NAME s_scm_substring
0f2d19dd 1264{
3ee86942 1265 size_t len, from, to;
685c0d71 1266
d1ca2c64 1267 SCM_VALIDATE_STRING (1, str);
3ee86942
MV
1268 len = scm_i_string_length (str);
1269 from = scm_to_unsigned_integer (start, 0, len);
a55c2b68 1270 if (SCM_UNBNDP (end))
3ee86942 1271 to = len;
a55c2b68 1272 else
3ee86942
MV
1273 to = scm_to_unsigned_integer (end, from, len);
1274 return scm_i_substring (str, from, to);
0f2d19dd 1275}
1bbd0b84 1276#undef FUNC_NAME
0f2d19dd 1277
ed35de72
MV
1278SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1279 (SCM str, SCM start, SCM end),
1280 "Return a newly allocated string formed from the characters\n"
1281 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1282 "ending with index @var{end} (exclusive).\n"
1283 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1284 "exact integers satisfying:\n"
1285 "\n"
1286 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1287 "\n"
1288 "The returned string is read-only.\n")
1289#define FUNC_NAME s_scm_substring_read_only
1290{
1291 size_t len, from, to;
1292
1293 SCM_VALIDATE_STRING (1, str);
1294 len = scm_i_string_length (str);
1295 from = scm_to_unsigned_integer (start, 0, len);
1296 if (SCM_UNBNDP (end))
1297 to = len;
1298 else
1299 to = scm_to_unsigned_integer (end, from, len);
1300 return scm_i_substring_read_only (str, from, to);
1301}
1302#undef FUNC_NAME
1303
3ee86942
MV
1304SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1305 (SCM str, SCM start, SCM end),
1306 "Return a newly allocated string formed from the characters\n"
1307 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1308 "ending with index @var{end} (exclusive).\n"
1309 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1310 "exact integers satisfying:\n\n"
1311 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1312#define FUNC_NAME s_scm_substring_copy
1313{
e1b29f6a
MV
1314 /* For the Scheme version, START is mandatory, but for the C
1315 version, it is optional. See scm_string_copy in srfi-13.c for a
1316 rationale.
1317 */
1318
1319 size_t from, to;
3ee86942
MV
1320
1321 SCM_VALIDATE_STRING (1, str);
e1b29f6a
MV
1322 scm_i_get_substring_spec (scm_i_string_length (str),
1323 start, &from, end, &to);
3ee86942
MV
1324 return scm_i_substring_copy (str, from, to);
1325}
1326#undef FUNC_NAME
1327
1328SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1329 (SCM str, SCM start, SCM end),
1330 "Return string that indirectly refers to the characters\n"
1331 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1332 "ending with index @var{end} (exclusive).\n"
1333 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1334 "exact integers satisfying:\n\n"
1335 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1336#define FUNC_NAME s_scm_substring_shared
1337{
1338 size_t len, from, to;
1339
1340 SCM_VALIDATE_STRING (1, str);
1341 len = scm_i_string_length (str);
1342 from = scm_to_unsigned_integer (start, 0, len);
1343 if (SCM_UNBNDP (end))
1344 to = len;
1345 else
1346 to = scm_to_unsigned_integer (end, from, len);
1347 return scm_i_substring_shared (str, from, to);
1348}
1349#undef FUNC_NAME
685c0d71 1350
3b3b36dd 1351SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
6fa73e72 1352 (SCM args),
9c44cd45 1353 "Return a newly allocated string whose characters form the\n"
0d26a824 1354 "concatenation of the given strings, @var{args}.")
1bbd0b84 1355#define FUNC_NAME s_scm_string_append
0f2d19dd
JB
1356{
1357 SCM res;
9c44cd45
MG
1358 size_t len = 0;
1359 int wide = 0;
c829a427 1360 SCM l, s;
bd4911ef 1361 size_t i;
9909c395
MG
1362 union
1363 {
1364 char *narrow;
1365 scm_t_wchar *wide;
1366 } data;
af45e3b0
DH
1367
1368 SCM_VALIDATE_REST_ARGUMENT (args);
9c44cd45 1369 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427
MV
1370 {
1371 s = SCM_CAR (l);
1372 SCM_VALIDATE_STRING (SCM_ARGn, s);
9c44cd45
MG
1373 len += scm_i_string_length (s);
1374 if (!scm_i_is_narrow_string (s))
1375 wide = 1;
c829a427 1376 }
9909c395 1377 data.narrow = NULL;
9c44cd45 1378 if (!wide)
190d4b0d 1379 res = scm_i_make_string (len, &data.narrow, 0);
9c44cd45 1380 else
190d4b0d 1381 res = scm_i_make_wide_string (len, &data.wide, 0);
9c44cd45
MG
1382
1383 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427 1384 {
edea856c 1385 size_t len;
c829a427 1386 s = SCM_CAR (l);
3ee86942 1387 SCM_VALIDATE_STRING (SCM_ARGn, s);
edea856c 1388 len = scm_i_string_length (s);
9c44cd45
MG
1389 if (!wide)
1390 {
9909c395
MG
1391 memcpy (data.narrow, scm_i_string_chars (s), len);
1392 data.narrow += len;
9c44cd45
MG
1393 }
1394 else
1395 {
1396 if (scm_i_is_narrow_string (s))
1397 {
1398 for (i = 0; i < scm_i_string_length (s); i++)
9909c395 1399 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
9c44cd45
MG
1400 }
1401 else
9909c395 1402 u32_cpy ((scm_t_uint32 *) data.wide,
9c44cd45 1403 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
9909c395 1404 data.wide += len;
9c44cd45 1405 }
c829a427
MV
1406 scm_remember_upto_here_1 (s);
1407 }
0f2d19dd
JB
1408 return res;
1409}
1bbd0b84 1410#undef FUNC_NAME
0f2d19dd 1411
24933780 1412
a3d7d5d5 1413\f
c62da8f8 1414/* Charset conversion error handling. */
a3d7d5d5
LC
1415
1416SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
c62da8f8
LC
1417SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1418
6851d3be
LC
1419/* Raise an exception informing that character CHR could not be written
1420 to PORT in its current encoding. */
d14418a5 1421void
ef7e4ba3 1422scm_encoding_error (const char *subr, int err, const char *message,
6851d3be 1423 SCM port, SCM chr)
ef7e4ba3 1424{
ef7e4ba3 1425 scm_throw (scm_encoding_error_key,
d050ef66
AW
1426 scm_list_n (scm_from_latin1_string (subr),
1427 scm_from_latin1_string (message),
ef7e4ba3 1428 scm_from_int (err),
6851d3be 1429 port, chr,
ef7e4ba3 1430 SCM_UNDEFINED));
a3d7d5d5
LC
1431}
1432
c62da8f8
LC
1433/* Raise an exception informing of an encoding error on PORT. This
1434 means that a character could not be written in PORT's encoding. */
1435void
1436scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1437{
1438 scm_throw (scm_decoding_error_key,
d050ef66
AW
1439 scm_list_n (scm_from_latin1_string (subr),
1440 scm_from_latin1_string (message),
c62da8f8
LC
1441 scm_from_int (err),
1442 port,
1443 SCM_UNDEFINED));
1444}
1445
1446\f
1447/* String conversion to/from C. */
1448
fac32b51 1449SCM
587a3355
MG
1450scm_from_stringn (const char *str, size_t len, const char *encoding,
1451 scm_t_string_failed_conversion_handler handler)
1452{
1453 size_t u32len, i;
1454 scm_t_wchar *u32;
1455 int wide = 0;
1456 SCM res;
1457
d40e1ca8 1458 /* The order of these checks is important. */
a574564c 1459 if (!str && len != 0)
d40e1ca8
AW
1460 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1461 if (len == (size_t) -1)
1462 len = strlen (str);
a574564c
AW
1463 if (len == 0)
1464 return scm_nullstr;
fac32b51 1465
889975e5
MG
1466 if (encoding == NULL)
1467 {
1468 /* If encoding is null, use Latin-1. */
1469 char *buf;
190d4b0d 1470 res = scm_i_make_string (len, &buf, 0);
889975e5
MG
1471 memcpy (buf, str, len);
1472 return res;
1473 }
1474
587a3355
MG
1475 u32len = 0;
1476 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1477 (enum iconv_ilseq_handler)
1478 handler,
1479 str, len,
1480 NULL,
1481 NULL, &u32len);
1482
ef7e4ba3 1483 if (SCM_UNLIKELY (u32 == NULL))
587a3355 1484 {
ef7e4ba3
LC
1485 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1486 handler. */
1487 SCM bv;
1488 signed char *buf;
1489
1490 buf = scm_gc_malloc_pointerless (len, "bytevector");
1491 memcpy (buf, str, len);
1492 bv = scm_c_take_bytevector (buf, len);
1493
c62da8f8
LC
1494 scm_decoding_error (__func__, errno,
1495 "input locale conversion error", bv);
587a3355
MG
1496 }
1497
1498 i = 0;
1499 while (i < u32len)
1500 if (u32[i++] > 0xFF)
1501 {
1502 wide = 1;
1503 break;
1504 }
1505
1506 if (!wide)
1507 {
1508 char *dst;
190d4b0d 1509 res = scm_i_make_string (u32len, &dst, 0);
587a3355
MG
1510 for (i = 0; i < u32len; i ++)
1511 dst[i] = (unsigned char) u32[i];
1512 dst[u32len] = '\0';
1513 }
1514 else
1515 {
1516 scm_t_wchar *wdst;
190d4b0d 1517 res = scm_i_make_wide_string (u32len, &wdst, 0);
587a3355
MG
1518 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1519 wdst[u32len] = 0;
1520 }
1521
1522 free (u32);
1523 return res;
1524}
1525
cf313a94 1526SCM
d40e1ca8 1527scm_from_locale_string (const char *str)
cf313a94 1528{
d40e1ca8 1529 return scm_from_locale_stringn (str, -1);
cf313a94
MG
1530}
1531
c829a427
MV
1532SCM
1533scm_from_locale_stringn (const char *str, size_t len)
1534{
95f5e303
AW
1535 return scm_from_stringn (str, len, locale_charset (),
1536 scm_i_get_conversion_strategy (SCM_BOOL_F));
c829a427 1537}
4d4528e7 1538
c829a427 1539SCM
d40e1ca8 1540scm_from_latin1_string (const char *str)
4d4528e7 1541{
d40e1ca8
AW
1542 return scm_from_latin1_stringn (str, -1);
1543}
9c44cd45 1544
d40e1ca8
AW
1545SCM
1546scm_from_latin1_stringn (const char *str, size_t len)
1547{
e9a35a96
LC
1548 char *buf;
1549 SCM result;
1550
1551 if (len == (size_t) -1)
1552 len = strlen (str);
1553
1554 /* Make a narrow string and copy STR as is. */
190d4b0d 1555 result = scm_i_make_string (len, &buf, 0);
e9a35a96
LC
1556 memcpy (buf, str, len);
1557
1558 return result;
c829a427 1559}
4d4528e7 1560
587a3355 1561SCM
d40e1ca8 1562scm_from_utf8_string (const char *str)
587a3355 1563{
d40e1ca8
AW
1564 return scm_from_utf8_stringn (str, -1);
1565}
1566
1567SCM
1568scm_from_utf8_stringn (const char *str, size_t len)
1569{
1570 return scm_from_stringn (str, len, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
587a3355
MG
1571}
1572
647dc1ac
LC
1573SCM
1574scm_from_utf32_string (const scm_t_wchar *str)
1575{
1576 return scm_from_utf32_stringn (str, -1);
1577}
1578
1579SCM
1580scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1581{
1582 SCM result;
1583 scm_t_wchar *buf;
1584
1585 if (len == (size_t) -1)
1586 len = u32_strlen ((uint32_t *) str);
1587
190d4b0d 1588 result = scm_i_make_wide_string (len, &buf, 0);
647dc1ac
LC
1589 memcpy (buf, str, len * sizeof (scm_t_wchar));
1590 scm_i_try_narrow_string (result);
1591
1592 return result;
1593}
1594
50b1996f
MG
1595/* Create a new scheme string from the C string STR. The memory of
1596 STR may be used directly as storage for the new string. */
13a94556
LC
1597/* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1598 would be to register a finalizer to eventually free(3) STR, which isn't
1599 worth it. Should we just deprecate the `scm_take_' functions? */
c829a427
MV
1600SCM
1601scm_take_locale_stringn (char *str, size_t len)
1602{
13a94556 1603 SCM res;
48ddf0d9 1604
13a94556
LC
1605 res = scm_from_locale_stringn (str, len);
1606 free (str);
c829a427 1607
c829a427
MV
1608 return res;
1609}
1610
48ddf0d9
KR
1611SCM
1612scm_take_locale_string (char *str)
1613{
1614 return scm_take_locale_stringn (str, -1);
1615}
1616
f1ee6d54
LC
1617/* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1618 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
31d4d02b
LC
1619 Set *LENP to the size of the resulting string.
1620
1621 FIXME: This is a hack we should get rid of. See
1622 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1623 for details. */
1624static void
1625unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
9c44cd45
MG
1626{
1627 char *before, *after;
1628 size_t i, j;
1629
4ff2b9f4
LC
1630 before = buf;
1631 after = buf;
9c44cd45
MG
1632 i = 0;
1633 j = 0;
1634 while (i < *lenp)
1635 {
1636 if ((i <= *lenp - 6)
1637 && before[i] == '\\'
1638 && before[i + 1] == 'u'
1639 && before[i + 2] == '0' && before[i + 3] == '0')
1640 {
1641 /* Convert \u00NN to \xNN */
1642 after[j] = '\\';
1643 after[j + 1] = 'x';
30a6b9ca
MG
1644 after[j + 2] = tolower ((int) before[i + 4]);
1645 after[j + 3] = tolower ((int) before[i + 5]);
9c44cd45
MG
1646 i += 6;
1647 j += 4;
1648 }
1649 else if ((i <= *lenp - 10)
1650 && before[i] == '\\'
1651 && before[i + 1] == 'U'
1652 && before[i + 2] == '0' && before[i + 3] == '0')
1653 {
1654 /* Convert \U00NNNNNN to \UNNNNNN */
1655 after[j] = '\\';
1656 after[j + 1] = 'U';
30a6b9ca
MG
1657 after[j + 2] = tolower ((int) before[i + 4]);
1658 after[j + 3] = tolower ((int) before[i + 5]);
1659 after[j + 4] = tolower ((int) before[i + 6]);
1660 after[j + 5] = tolower ((int) before[i + 7]);
1661 after[j + 6] = tolower ((int) before[i + 8]);
1662 after[j + 7] = tolower ((int) before[i + 9]);
9c44cd45
MG
1663 i += 10;
1664 j += 8;
1665 }
1666 else
1667 {
1668 after[j] = before[i];
1669 i++;
1670 j++;
1671 }
1672 }
1673 *lenp = j;
9c44cd45
MG
1674}
1675
f1ee6d54
LC
1676/* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1677 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1678 of the resulting string. BUF must be large enough to handle the
1679 worst case when `\uXXXX' escapes (6 characters) are replaced by
1680 `\xXXXX;' (7 characters). */
31d4d02b
LC
1681static void
1682unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
d31b9519
MG
1683{
1684 char *before, *after;
1685 size_t i, j;
1686 /* The worst case is if the input string contains all 4-digit hex escapes.
1687 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1688 size_t max_out_len = (*lenp * 7) / 6 + 1;
1689 size_t nzeros, ndigits;
1690
4ff2b9f4 1691 before = buf;
d31b9519
MG
1692 after = alloca (max_out_len);
1693 i = 0;
1694 j = 0;
1695 while (i < *lenp)
1696 {
1697 if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1698 || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1699 {
1700 if (before[i + 1] == 'u')
1701 ndigits = 4;
1702 else if (before[i + 1] == 'U')
1703 ndigits = 8;
1704 else
1705 abort ();
1706
1707 /* Add the R6RS hex escape initial sequence. */
1708 after[j] = '\\';
1709 after[j + 1] = 'x';
1710
1711 /* Move string positions to the start of the hex numbers. */
1712 i += 2;
1713 j += 2;
1714
1715 /* Find the number of initial zeros in this hex number. */
1716 nzeros = 0;
1717 while (before[i + nzeros] == '0' && nzeros < ndigits)
1718 nzeros++;
1719
1720 /* Copy the number, skipping initial zeros, and then move the string
1721 positions. */
1722 if (nzeros == ndigits)
1723 {
1724 after[j] = '0';
1725 i += ndigits;
1726 j += 1;
1727 }
1728 else
1729 {
1730 int pos;
1731 for (pos = 0; pos < ndigits - nzeros; pos++)
1732 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1733 i += ndigits;
1734 j += (ndigits - nzeros);
1735 }
1736
1737 /* Add terminating semicolon. */
1738 after[j] = ';';
1739 j++;
1740 }
1741 else
1742 {
1743 after[j] = before[i];
1744 i++;
1745 j++;
1746 }
1747 }
1748 *lenp = j;
d31b9519
MG
1749 memcpy (before, after, j);
1750}
1751
cf313a94 1752char *
d40e1ca8 1753scm_to_locale_string (SCM str)
cf313a94 1754{
d40e1ca8 1755 return scm_to_locale_stringn (str, NULL);
cf313a94 1756}
d31b9519 1757
c829a427 1758char *
fac32b51 1759scm_to_locale_stringn (SCM str, size_t *lenp)
c829a427 1760{
889975e5 1761 return scm_to_stringn (str, lenp,
95f5e303 1762 locale_charset (),
889975e5 1763 scm_i_get_conversion_strategy (SCM_BOOL_F));
9c44cd45
MG
1764}
1765
d40e1ca8
AW
1766char *
1767scm_to_latin1_string (SCM str)
1768{
1769 return scm_to_latin1_stringn (str, NULL);
1770}
1771
1772char *
1773scm_to_latin1_stringn (SCM str, size_t *lenp)
e9a35a96 1774#define FUNC_NAME "scm_to_latin1_stringn"
d40e1ca8 1775{
e9a35a96
LC
1776 char *result;
1777
1778 SCM_VALIDATE_STRING (1, str);
1779
1780 if (scm_i_is_narrow_string (str))
1781 {
1782 if (lenp)
1783 *lenp = scm_i_string_length (str);
1784
1785 result = scm_strdup (scm_i_string_data (str));
1786 }
1787 else
1788 result = scm_to_stringn (str, lenp, NULL,
1789 SCM_FAILED_CONVERSION_ERROR);
1790
1791 return result;
d40e1ca8 1792}
e9a35a96 1793#undef FUNC_NAME
d40e1ca8
AW
1794
1795char *
1796scm_to_utf8_string (SCM str)
1797{
1798 return scm_to_utf8_stringn (str, NULL);
1799}
1800
1801char *
1802scm_to_utf8_stringn (SCM str, size_t *lenp)
1803{
1804 return scm_to_stringn (str, lenp, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1805}
1806
647dc1ac
LC
1807scm_t_wchar *
1808scm_to_utf32_string (SCM str)
1809{
1810 return scm_to_utf32_stringn (str, NULL);
1811}
1812
1813scm_t_wchar *
1814scm_to_utf32_stringn (SCM str, size_t *lenp)
1815#define FUNC_NAME "scm_to_utf32_stringn"
1816{
1817 scm_t_wchar *result;
1818
1819 SCM_VALIDATE_STRING (1, str);
1820
1821 if (scm_i_is_narrow_string (str))
1822 result = (scm_t_wchar *)
1823 scm_to_stringn (str, lenp, "UTF-32",
1824 SCM_FAILED_CONVERSION_ERROR);
1825 else
1826 {
1827 size_t len;
1828
1829 len = scm_i_string_length (str);
1830 if (lenp)
1831 *lenp = len;
1832
1833 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
1834 memcpy (result, scm_i_string_wide_chars (str),
1835 len * sizeof (scm_t_wchar));
1836 result[len] = 0;
1837 }
1838
1839 return result;
1840}
1841#undef FUNC_NAME
1842
29bcdbb0
LC
1843/* Return a malloc(3)-allocated buffer containing the contents of STR encoded
1844 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
1845 the returned buffer. If the conversion to ENCODING fails, apply the strategy
1846 defined by HANDLER. */
9c44cd45 1847char *
587a3355 1848scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
eca29b02 1849 scm_t_string_failed_conversion_handler handler)
9c44cd45 1850{
9c44cd45
MG
1851 char *buf;
1852 size_t ilen, len, i;
889975e5
MG
1853 int ret;
1854 const char *enc;
4d4528e7 1855
3ee86942 1856 if (!scm_is_string (str))
c829a427 1857 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1858 ilen = scm_i_string_length (str);
1859
1860 if (ilen == 0)
1861 {
1862 buf = scm_malloc (1);
1863 buf[0] = '\0';
1864 if (lenp)
1865 *lenp = 0;
1866 return buf;
1867 }
587a3355 1868
c829a427 1869 if (lenp == NULL)
9c44cd45
MG
1870 for (i = 0; i < ilen; i++)
1871 if (scm_i_string_ref (str, i) == '\0')
1872 scm_misc_error (NULL,
1873 "string contains #\\nul character: ~S",
1874 scm_list_1 (str));
1875
889975e5 1876 if (scm_i_is_narrow_string (str) && (encoding == NULL))
c829a427 1877 {
889975e5
MG
1878 /* If using native Latin-1 encoding, just copy the string
1879 contents. */
9c44cd45
MG
1880 if (lenp)
1881 {
1882 buf = scm_malloc (ilen);
1883 memcpy (buf, scm_i_string_chars (str), ilen);
1884 *lenp = ilen;
1885 return buf;
1886 }
1887 else
1888 {
1889 buf = scm_malloc (ilen + 1);
1890 memcpy (buf, scm_i_string_chars (str), ilen);
1891 buf[ilen] = '\0';
1892 return buf;
1893 }
c829a427 1894 }
9c44cd45 1895
587a3355 1896
9c44cd45
MG
1897 buf = NULL;
1898 len = 0;
889975e5
MG
1899 enc = encoding;
1900 if (enc == NULL)
1901 enc = "ISO-8859-1";
1902 if (scm_i_is_narrow_string (str))
1903 {
1904 ret = mem_iconveh (scm_i_string_chars (str), ilen,
1905 "ISO-8859-1", enc,
1906 (enum iconv_ilseq_handler) handler, NULL,
1907 &buf, &len);
9c44cd45 1908
889975e5 1909 if (ret != 0)
ef7e4ba3 1910 scm_encoding_error (__func__, errno,
6851d3be
LC
1911 "cannot convert narrow string to output locale",
1912 SCM_BOOL_F,
1913 /* FIXME: Faulty character unknown. */
1914 SCM_BOOL_F);
889975e5
MG
1915 }
1916 else
1917 {
d31b9519 1918 buf = u32_conv_to_encoding (enc,
889975e5 1919 (enum iconv_ilseq_handler) handler,
d31b9519 1920 (scm_t_uint32 *) scm_i_string_wide_chars (str),
889975e5
MG
1921 ilen,
1922 NULL,
1923 NULL, &len);
1924 if (buf == NULL)
ef7e4ba3 1925 scm_encoding_error (__func__, errno,
6851d3be
LC
1926 "cannot convert wide string to output locale",
1927 SCM_BOOL_F,
1928 /* FIXME: Faulty character unknown. */
1929 SCM_BOOL_F);
d31b9519
MG
1930 }
1931 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
1932 {
1933 if (SCM_R6RS_ESCAPES_P)
f1ee6d54
LC
1934 {
1935 /* The worst case is if the input string contains all 4-digit
1936 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
1937 (seven characters). Make BUF large enough to hold
1938 that. */
1939 buf = scm_realloc (buf, (len * 7) / 6 + 1);
31d4d02b 1940 unistring_escapes_to_r6rs_escapes (buf, &len);
f1ee6d54 1941 }
d31b9519 1942 else
31d4d02b 1943 unistring_escapes_to_guile_escapes (buf, &len);
4ff2b9f4
LC
1944
1945 buf = scm_realloc (buf, len);
889975e5 1946 }
9c44cd45 1947 if (lenp)
4d4528e7 1948 *lenp = len;
9c44cd45
MG
1949 else
1950 {
1951 buf = scm_realloc (buf, len + 1);
1952 buf[len] = '\0';
1953 }
24933780 1954
c829a427 1955 scm_remember_upto_here_1 (str);
9c44cd45 1956 return buf;
4d4528e7 1957}
af68e5e5 1958
c829a427
MV
1959size_t
1960scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
1961{
1962 size_t len;
9c44cd45 1963 char *result = NULL;
3ee86942 1964 if (!scm_is_string (str))
c829a427 1965 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1966 result = scm_to_locale_stringn (str, &len);
1967
1968 memcpy (buf, result, (len > max_len) ? max_len : len);
1969 free (result);
1970
c829a427
MV
1971 scm_remember_upto_here_1 (str);
1972 return len;
1973}
af68e5e5 1974
a3d7d5d5
LC
1975\f
1976/* Unicode string normalization. */
1977
edb7bb47
JG
1978/* This function is a partial clone of SCM_STRING_TO_U32_BUF from
1979 libguile/i18n.c. It would be useful to have this factored out into a more
1980 convenient location, but its use of alloca makes that tricky to do. */
1981
1982static SCM
1983normalize_str (SCM string, uninorm_t form)
1984{
1985 SCM ret;
1986 scm_t_uint32 *w_str;
1987 scm_t_wchar *cbuf;
1988 size_t rlen, len = scm_i_string_length (string);
1989
1990 if (scm_i_is_narrow_string (string))
1991 {
1992 size_t i;
1993 const char *buf = scm_i_string_chars (string);
1994
1995 w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
1996
1997 for (i = 0; i < len; i ++)
1998 w_str[i] = (unsigned char) buf[i];
1999 w_str[len] = 0;
2000 }
d8164b04
JG
2001 else
2002 w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2003
edb7bb47
JG
2004 w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2005
190d4b0d 2006 ret = scm_i_make_wide_string (rlen, &cbuf, 0);
edb7bb47
JG
2007 u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2008 free (w_str);
d8164b04
JG
2009
2010 scm_i_try_narrow_string (ret);
2011
edb7bb47
JG
2012 return ret;
2013}
2014
2015SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2016 (SCM string),
2017 "Returns the NFC normalized form of @var{string}.")
2018#define FUNC_NAME s_scm_string_normalize_nfc
2019{
2020 SCM_VALIDATE_STRING (1, string);
2021 return normalize_str (string, UNINORM_NFC);
2022}
2023#undef FUNC_NAME
2024
2025SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2026 (SCM string),
2027 "Returns the NFD normalized form of @var{string}.")
2028#define FUNC_NAME s_scm_string_normalize_nfd
2029{
2030 SCM_VALIDATE_STRING (1, string);
2031 return normalize_str (string, UNINORM_NFD);
2032}
2033#undef FUNC_NAME
2034
2035SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2036 (SCM string),
2037 "Returns the NFKC normalized form of @var{string}.")
2038#define FUNC_NAME s_scm_string_normalize_nfkc
2039{
2040 SCM_VALIDATE_STRING (1, string);
2041 return normalize_str (string, UNINORM_NFKC);
2042}
2043#undef FUNC_NAME
2044
2045SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2046 (SCM string),
2047 "Returns the NFKD normalized form of @var{string}.")
2048#define FUNC_NAME s_scm_string_normalize_nfkd
2049{
2050 SCM_VALIDATE_STRING (1, string);
2051 return normalize_str (string, UNINORM_NFKD);
2052}
2053#undef FUNC_NAME
2054
7505c6e0
MW
2055/* converts C scm_array of strings to SCM scm_list of strings.
2056 If argc < 0, a null terminated scm_array is assumed.
2057 The current locale encoding is assumed */
9c44cd45 2058SCM
3ee86942
MV
2059scm_makfromstrs (int argc, char **argv)
2060{
2061 int i = argc;
2062 SCM lst = SCM_EOL;
2063 if (0 > i)
2064 for (i = 0; argv[i]; i++);
2065 while (i--)
2066 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2067 return lst;
2068}
2069
c829a427 2070/* Return a newly allocated array of char pointers to each of the strings
7505c6e0
MW
2071 in args, with a terminating NULL pointer. The strings are encoded using
2072 the current locale. */
c829a427
MV
2073
2074char **
2075scm_i_allocate_string_pointers (SCM list)
2a776823 2076#define FUNC_NAME "scm_i_allocate_string_pointers"
af68e5e5 2077{
c829a427 2078 char **result;
7505c6e0 2079 int list_len = scm_ilength (list);
c829a427
MV
2080 int i;
2081
7505c6e0 2082 if (list_len < 0)
c829a427
MV
2083 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2084
7505c6e0 2085 result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2a776823 2086 "string pointers");
7505c6e0 2087 result[list_len] = NULL;
c829a427 2088
7505c6e0 2089 /* The list might have been modified in another thread, so
c829a427
MV
2090 we check LIST before each access.
2091 */
7505c6e0 2092 for (i = 0; i < list_len && scm_is_pair (list); i++)
c829a427 2093 {
7505c6e0
MW
2094 SCM str = SCM_CAR (list);
2095 size_t len; /* String length in bytes */
2096 char *c_str = scm_to_locale_stringn (str, &len);
2097
2098 /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2099 scm_malloc to allocate the returned string, which must be
2100 explicitly deallocated. This forces us to copy the string a
2101 second time into a new buffer. Ideally there would be variants
2102 of scm_to_*_stringn that can return garbage-collected buffers. */
2103
2104 result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2105 memcpy (result[i], c_str, len);
2a776823 2106 result[i][len] = '\0';
7505c6e0 2107 free (c_str);
2a776823 2108
c829a427
MV
2109 list = SCM_CDR (list);
2110 }
2111
c829a427 2112 return result;
af68e5e5 2113}
2a776823 2114#undef FUNC_NAME
24933780 2115
6f14f578
MV
2116void
2117scm_i_get_substring_spec (size_t len,
2118 SCM start, size_t *cstart,
2119 SCM end, size_t *cend)
2120{
2121 if (SCM_UNBNDP (start))
2122 *cstart = 0;
2123 else
2124 *cstart = scm_to_unsigned_integer (start, 0, len);
2125
2126 if (SCM_UNBNDP (end))
2127 *cend = len;
2128 else
2129 *cend = scm_to_unsigned_integer (end, *cstart, len);
2130}
2131
3ee86942
MV
2132#if SCM_ENABLE_DEPRECATED
2133
556d75db
MV
2134/* When these definitions are removed, it becomes reasonable to use
2135 read-only strings for string literals. For that, change the reader
2136 to create string literals with scm_c_substring_read_only instead of
2137 with scm_c_substring_copy.
2138*/
2139
3ee86942 2140int
fe78c51a 2141scm_i_deprecated_stringp (SCM str)
3ee86942
MV
2142{
2143 scm_c_issue_deprecation_warning
2144 ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
2145
2616f0e0 2146 return scm_is_string (str);
3ee86942
MV
2147}
2148
2149char *
fe78c51a 2150scm_i_deprecated_string_chars (SCM str)
3ee86942
MV
2151{
2152 char *chars;
2153
2154 scm_c_issue_deprecation_warning
2155 ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
2156
2616f0e0
MV
2157 /* We don't accept shared substrings here since they are not
2158 null-terminated.
2159 */
2160 if (IS_SH_STRING (str))
c291b588
LC
2161 scm_misc_error (NULL,
2162 "SCM_STRING_CHARS does not work with shared substrings",
2616f0e0
MV
2163 SCM_EOL);
2164
877f06c3 2165 /* We explicitly test for read-only strings to produce a better
556d75db
MV
2166 error message.
2167 */
2168
2169 if (IS_RO_STRING (str))
c291b588
LC
2170 scm_misc_error (NULL,
2171 "SCM_STRING_CHARS does not work with read-only strings",
556d75db 2172 SCM_EOL);
c291b588 2173
2616f0e0 2174 /* The following is still wrong, of course...
3ee86942 2175 */
9c44cd45 2176 str = scm_i_string_start_writing (str);
3ee86942
MV
2177 chars = scm_i_string_writable_chars (str);
2178 scm_i_string_stop_writing ();
2179 return chars;
2180}
2181
2182size_t
fe78c51a 2183scm_i_deprecated_string_length (SCM str)
3ee86942
MV
2184{
2185 scm_c_issue_deprecation_warning
2186 ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
2187 return scm_c_string_length (str);
2188}
2189
2190#endif
2191
2a610be5
AW
2192static SCM
2193string_handle_ref (scm_t_array_handle *h, size_t index)
2194{
2195 return scm_c_string_ref (h->array, index);
2196}
2197
2198static void
2199string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2200{
2201 scm_c_string_set_x (h->array, index, val);
2202}
2203
2204static void
2205string_get_handle (SCM v, scm_t_array_handle *h)
2206{
2207 h->array = v;
2208 h->ndims = 1;
2209 h->dims = &h->dim0;
2210 h->dim0.lbnd = 0;
2211 h->dim0.ubnd = scm_c_string_length (v) - 1;
2212 h->dim0.inc = 1;
2213 h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2214 h->elements = h->writable_elements = NULL;
2215}
2216
c5f17102 2217SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2a610be5 2218 string_handle_ref, string_handle_set,
f65e0168
LC
2219 string_get_handle)
2220SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2a610be5 2221
0f2d19dd
JB
2222void
2223scm_init_strings ()
0f2d19dd 2224{
190d4b0d 2225 scm_nullstr = scm_i_make_string (0, NULL, 1);
7c33806a 2226
a0599745 2227#include "libguile/strings.x"
0f2d19dd
JB
2228}
2229
89e00824
ML
2230
2231/*
2232 Local Variables:
2233 c-file-style: "gnu"
2234 End:
2235*/