Hide the string escaping hacks.
[bpt/guile.git] / libguile / strings.c
CommitLineData
d40e1ca8 1/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
0f2d19dd 2 *
73be1d9e 3 * This library is free software; you can redistribute it and/or
53befeb7
NJ
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
0f2d19dd 7 *
53befeb7
NJ
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
0f2d19dd 12 *
73be1d9e
MV
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
53befeb7
NJ
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
73be1d9e 17 */
1bbd0b84 18
1bbd0b84 19
0f2d19dd 20\f
dbb605f5
LC
21#ifdef HAVE_CONFIG_H
22# include <config.h>
23#endif
0f2d19dd 24
cdd47ec7 25#include <alloca.h>
faf2c9d7 26#include <string.h>
3ee86942 27#include <stdio.h>
9c44cd45 28#include <ctype.h>
edb7bb47 29#include <uninorm.h>
9c44cd45 30#include <unistr.h>
eca29b02 31#include <uniconv.h>
faf2c9d7 32
889975e5
MG
33#include "striconveh.h"
34
a0599745
MD
35#include "libguile/_scm.h"
36#include "libguile/chars.h"
7c33806a 37#include "libguile/root.h"
a0599745 38#include "libguile/strings.h"
a3d7d5d5 39#include "libguile/error.h"
f45eccff 40#include "libguile/generalized-vectors.h"
1afff620 41#include "libguile/deprecation.h"
a0599745 42#include "libguile/validate.h"
d31b9519 43#include "libguile/private-options.h"
1afff620 44
0f2d19dd
JB
45\f
46
47/* {Strings}
48 */
49
3ee86942
MV
50
51/* Stringbufs
52 *
53 * XXX - keeping an accurate refcount during GC seems to be quite
54 * tricky, so we just keep score of whether a stringbuf might be
50b1996f 55 * shared, not whether it definitely is.
3ee86942
MV
56 *
57 * The scheme I (mvo) tried to keep an accurate reference count would
58 * recount all strings that point to a stringbuf during the mark-phase
59 * of the GC. This was done since one cannot access the stringbuf of
60 * a string when that string is freed (in order to decrease the
61 * reference count). The memory of the stringbuf might have been
62 * reused already for something completely different.
63 *
64 * This recounted worked for a small number of threads beating on
65 * cow-strings, but it failed randomly with more than 10 threads, say.
66 * I couldn't figure out what went wrong, so I used the conservative
67 * approach implemented below.
50b1996f 68 *
ba54a202
LC
69 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
70 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
71 * strings.
3ee86942
MV
72 */
73
ba54a202
LC
74/* The size in words of the stringbuf header (type tag + size). */
75#define STRINGBUF_HEADER_SIZE 2U
76
77#define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
78
35920c00 79#define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
5f236208 80#define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
3ee86942
MV
81
82#define STRINGBUF_TAG scm_tc7_stringbuf
83#define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
9c44cd45 84#define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
3ee86942 85
100e20c7 86#define STRINGBUF_CONTENTS(buf) ((void *) \
ba54a202
LC
87 SCM_CELL_OBJECT_LOC (buf, \
88 STRINGBUF_HEADER_SIZE))
100e20c7
LC
89#define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
90#define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
3ee86942 91
100e20c7 92#define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
3ee86942 93
9b41542f
LC
94#define SET_STRINGBUF_SHARED(buf) \
95 do \
96 { \
97 /* Don't modify BUF if it's already marked as shared since it might be \
98 a read-only, statically allocated stringbuf. */ \
99 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
100 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
101 } \
102 while (0)
3ee86942 103
56a3dcd4 104#ifdef SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
105static size_t lenhist[1001];
106#endif
107
50b1996f
MG
108/* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
109 characters. */
3ee86942
MV
110static SCM
111make_stringbuf (size_t len)
0f2d19dd 112{
3ee86942
MV
113 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
114 scm_i_symbol_chars, all stringbufs are null-terminated. Once
115 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
116 has been changed for scm_i_symbol_chars, this null-termination
117 can be dropped.
118 */
119
ba54a202
LC
120 SCM buf;
121
56a3dcd4 122#ifdef SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
123 if (len < 1000)
124 lenhist[len]++;
125 else
126 lenhist[1000]++;
127#endif
0f2d19dd 128
ba54a202
LC
129 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
130 "string"));
131
132 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
133 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
134
135 STRINGBUF_CHARS (buf)[len] = 0;
136
137 return buf;
3ee86942 138}
e53cc817 139
50b1996f
MG
140/* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
141 characters. */
9c44cd45
MG
142static SCM
143make_wide_stringbuf (size_t len)
144{
ba54a202
LC
145 SCM buf;
146 size_t raw_len;
147
56a3dcd4 148#ifdef SCM_STRING_LENGTH_HISTOGRAM
9c44cd45
MG
149 if (len < 1000)
150 lenhist[len]++;
151 else
152 lenhist[1000]++;
153#endif
154
ba54a202
LC
155 raw_len = (len + 1) * sizeof (scm_t_wchar);
156 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
157 "string"));
158
159 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
160 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
161
162 STRINGBUF_WIDE_CHARS (buf)[len] = 0;
163
164 return buf;
9c44cd45
MG
165}
166
ba54a202
LC
167/* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
168 characters from BUF. */
169static SCM
170wide_stringbuf (SCM buf)
9c44cd45 171{
ba54a202 172 SCM new_buf;
9c44cd45
MG
173
174 if (STRINGBUF_WIDE (buf))
ba54a202
LC
175 new_buf = buf;
176 else
9c44cd45 177 {
ba54a202
LC
178 size_t i, len;
179 scm_t_wchar *mem;
9c44cd45 180
ba54a202 181 len = STRINGBUF_LENGTH (buf);
9c44cd45 182
ba54a202 183 new_buf = make_wide_stringbuf (len);
9c44cd45 184
ba54a202 185 mem = STRINGBUF_WIDE_CHARS (new_buf);
9c44cd45 186 for (i = 0; i < len; i++)
ba54a202 187 mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
9c44cd45 188 mem[len] = 0;
9c44cd45 189 }
ba54a202
LC
190
191 return new_buf;
3ee86942 192}
bd9e24b3 193
ba54a202
LC
194/* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
195 characters from BUF, if possible. */
196static SCM
587a3355
MG
197narrow_stringbuf (SCM buf)
198{
ba54a202 199 SCM new_buf;
587a3355
MG
200
201 if (!STRINGBUF_WIDE (buf))
ba54a202
LC
202 new_buf = buf;
203 else
204 {
205 size_t i, len;
206 scm_t_wchar *wmem;
207 unsigned char *mem;
587a3355 208
ba54a202
LC
209 len = STRINGBUF_LENGTH (buf);
210 wmem = STRINGBUF_WIDE_CHARS (buf);
587a3355 211
ba54a202
LC
212 for (i = 0; i < len; i++)
213 if (wmem[i] > 0xFF)
214 /* BUF cannot be narrowed. */
215 return buf;
587a3355 216
ba54a202 217 new_buf = make_stringbuf (len);
587a3355 218
ba54a202
LC
219 mem = STRINGBUF_CHARS (new_buf);
220 for (i = 0; i < len; i++)
221 mem[i] = (unsigned char) wmem[i];
222 mem[len] = 0;
223 }
224
225 return new_buf;
587a3355
MG
226}
227
9de87eea 228scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
bd9e24b3 229
ba54a202 230\f
3ee86942
MV
231/* Copy-on-write strings.
232 */
bd9e24b3 233
3ee86942 234#define STRING_TAG scm_tc7_string
bd9e24b3 235
3ee86942
MV
236#define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
237#define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
238#define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
bd9e24b3 239
3ee86942
MV
240#define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
241#define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
242
243#define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
244
ed35de72
MV
245/* Read-only strings.
246 */
247
35920c00 248#define RO_STRING_TAG scm_tc7_ro_string
ed35de72
MV
249#define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
250
e1b29f6a
MV
251/* Mutation-sharing substrings
252 */
253
254#define SH_STRING_TAG (scm_tc7_string + 0x100)
255
256#define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
257/* START and LENGTH as for STRINGs. */
258
259#define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
260
e7efe8e7
AW
261SCM scm_nullstr;
262
50b1996f
MG
263/* Create a scheme string with space for LEN 8-bit Latin-1-encoded
264 characters. CHARSP, if not NULL, will be set to location of the
265 char array. */
3ee86942
MV
266SCM
267scm_i_make_string (size_t len, char **charsp)
268{
269 SCM buf = make_stringbuf (len);
270 SCM res;
271 if (charsp)
f59cf998 272 *charsp = (char *) STRINGBUF_CHARS (buf);
3ee86942
MV
273 res = scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
274 (scm_t_bits)0, (scm_t_bits) len);
275 return res;
0f2d19dd
JB
276}
277
50b1996f
MG
278/* Create a scheme string with space for LEN 32-bit UCS-4-encoded
279 characters. CHARSP, if not NULL, will be set to location of the
280 character array. */
9c44cd45 281SCM
50b1996f 282scm_i_make_wide_string (size_t len, scm_t_wchar **charsp)
9c44cd45
MG
283{
284 SCM buf = make_wide_stringbuf (len);
285 SCM res;
286 if (charsp)
287 *charsp = STRINGBUF_WIDE_CHARS (buf);
288 res = scm_double_cell (STRING_TAG, SCM_UNPACK (buf),
289 (scm_t_bits) 0, (scm_t_bits) len);
290 return res;
291}
292
3ee86942
MV
293static void
294validate_substring_args (SCM str, size_t start, size_t end)
295{
296 if (!IS_STRING (str))
297 scm_wrong_type_arg_msg (NULL, 0, str, "string");
298 if (start > STRING_LENGTH (str))
299 scm_out_of_range (NULL, scm_from_size_t (start));
300 if (end > STRING_LENGTH (str) || end < start)
301 scm_out_of_range (NULL, scm_from_size_t (end));
302}
0f2d19dd 303
e1b29f6a
MV
304static inline void
305get_str_buf_start (SCM *str, SCM *buf, size_t *start)
306{
307 *start = STRING_START (*str);
308 if (IS_SH_STRING (*str))
309 {
310 *str = SH_STRING_STRING (*str);
311 *start += STRING_START (*str);
312 }
313 *buf = STRING_STRINGBUF (*str);
314}
315
3ee86942
MV
316SCM
317scm_i_substring (SCM str, size_t start, size_t end)
0f2d19dd 318{
e1b29f6a
MV
319 SCM buf;
320 size_t str_start;
321 get_str_buf_start (&str, &buf, &str_start);
9de87eea 322 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 323 SET_STRINGBUF_SHARED (buf);
9de87eea 324 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 325 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
e1b29f6a
MV
326 (scm_t_bits)str_start + start,
327 (scm_t_bits) end - start);
0f2d19dd
JB
328}
329
ed35de72
MV
330SCM
331scm_i_substring_read_only (SCM str, size_t start, size_t end)
332{
45a9f430
LC
333 SCM buf;
334 size_t str_start;
335 get_str_buf_start (&str, &buf, &str_start);
336 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
337 SET_STRINGBUF_SHARED (buf);
338 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
339 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
340 (scm_t_bits)str_start + start,
341 (scm_t_bits) end - start);
ed35de72
MV
342}
343
3ee86942
MV
344SCM
345scm_i_substring_copy (SCM str, size_t start, size_t end)
346{
347 size_t len = end - start;
edea856c 348 SCM buf, my_buf;
e1b29f6a
MV
349 size_t str_start;
350 get_str_buf_start (&str, &buf, &str_start);
9c44cd45
MG
351 if (scm_i_is_narrow_string (str))
352 {
353 my_buf = make_stringbuf (len);
354 memcpy (STRINGBUF_CHARS (my_buf),
355 STRINGBUF_CHARS (buf) + str_start + start, len);
356 }
357 else
358 {
359 my_buf = make_wide_stringbuf (len);
360 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
361 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
362 + start), len);
363 /* Even though this string is wide, the substring may be narrow.
50b1996f 364 Consider adding code to narrow the string. */
9c44cd45 365 }
3ee86942 366 scm_remember_upto_here_1 (buf);
9c44cd45
MG
367 return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
368 (scm_t_bits) 0, (scm_t_bits) len);
3ee86942 369}
0f2d19dd 370
e1b29f6a
MV
371SCM
372scm_i_substring_shared (SCM str, size_t start, size_t end)
373{
374 if (start == 0 && end == STRING_LENGTH (str))
375 return str;
376 else
377 {
378 size_t len = end - start;
379 if (IS_SH_STRING (str))
380 {
381 start += STRING_START (str);
382 str = SH_STRING_STRING (str);
383 }
384 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
385 (scm_t_bits)start, (scm_t_bits) len);
386 }
387}
388
3ee86942
MV
389SCM
390scm_c_substring (SCM str, size_t start, size_t end)
391{
392 validate_substring_args (str, start, end);
393 return scm_i_substring (str, start, end);
394}
ee149d03 395
ed35de72
MV
396SCM
397scm_c_substring_read_only (SCM str, size_t start, size_t end)
398{
399 validate_substring_args (str, start, end);
400 return scm_i_substring_read_only (str, start, end);
401}
402
0f2d19dd 403SCM
3ee86942 404scm_c_substring_copy (SCM str, size_t start, size_t end)
0f2d19dd 405{
3ee86942
MV
406 validate_substring_args (str, start, end);
407 return scm_i_substring_copy (str, start, end);
408}
409
3ee86942
MV
410SCM
411scm_c_substring_shared (SCM str, size_t start, size_t end)
412{
413 validate_substring_args (str, start, end);
414 return scm_i_substring_shared (str, start, end);
415}
0f2d19dd 416
d6c74168 417\f
3ee86942
MV
418/* Internal accessors
419 */
420
50b1996f
MG
421/* Returns the number of characters in STR. This may be different
422 than the memory size of the string storage. */
3ee86942
MV
423size_t
424scm_i_string_length (SCM str)
0f2d19dd 425{
3ee86942 426 return STRING_LENGTH (str);
0f2d19dd
JB
427}
428
50b1996f
MG
429/* True if the string is 'narrow', meaning it has a 8-bit Latin-1
430 encoding. False if it is 'wide', having a 32-bit UCS-4
431 encoding. */
9c44cd45
MG
432int
433scm_i_is_narrow_string (SCM str)
434{
435 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
436}
437
587a3355
MG
438/* Try to coerce a string to be narrow. It if is narrow already, do
439 nothing. If it is wide, shrink it to narrow if none of its
440 characters are above 0xFF. Return true if the string is narrow or
441 was made to be narrow. */
442int
443scm_i_try_narrow_string (SCM str)
444{
ba54a202 445 SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
587a3355
MG
446
447 return scm_i_is_narrow_string (str);
448}
449
100e20c7
LC
450/* Return a pointer to the raw data of the string, which can be either Latin-1
451 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
452const void *
453scm_i_string_data (SCM str)
454{
455 SCM buf;
456 size_t start;
457 const char *data;
458
459 get_str_buf_start (&str, &buf, &start);
460
461 data = STRINGBUF_CONTENTS (buf);
462 data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
463
464 return data;
465}
466
50b1996f
MG
467/* Returns a pointer to the 8-bit Latin-1 encoded character array of
468 STR. */
3ee86942
MV
469const char *
470scm_i_string_chars (SCM str)
471{
472 SCM buf;
e1b29f6a
MV
473 size_t start;
474 get_str_buf_start (&str, &buf, &start);
9c44cd45 475 if (scm_i_is_narrow_string (str))
f59cf998 476 return (const char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
477 else
478 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
479 scm_list_1 (str));
480 return NULL;
3ee86942 481}
b00418df 482
50b1996f
MG
483/* Returns a pointer to the 32-bit UCS-4 encoded character array of
484 STR. */
9c44cd45
MG
485const scm_t_wchar *
486scm_i_string_wide_chars (SCM str)
487{
488 SCM buf;
489 size_t start;
490
491 get_str_buf_start (&str, &buf, &start);
492 if (!scm_i_is_narrow_string (str))
f59cf998 493 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
9c44cd45
MG
494 else
495 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
496 scm_list_1 (str));
497}
498
499/* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
500 a new string buffer, so that it can be modified without modifying
50b1996f
MG
501 other strings. Also, lock the string mutex. Later, one must call
502 scm_i_string_stop_writing to unlock the mutex. */
9c44cd45
MG
503SCM
504scm_i_string_start_writing (SCM orig_str)
b00418df 505{
ed35de72 506 SCM buf, str = orig_str;
e1b29f6a 507 size_t start;
ed35de72 508
e1b29f6a 509 get_str_buf_start (&str, &buf, &start);
ed35de72
MV
510 if (IS_RO_STRING (str))
511 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
512
9de87eea 513 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942
MV
514 if (STRINGBUF_SHARED (buf))
515 {
9c44cd45 516 /* Clone the stringbuf. */
3ee86942
MV
517 size_t len = STRING_LENGTH (str);
518 SCM new_buf;
519
9de87eea 520 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 521
9c44cd45
MG
522 if (scm_i_is_narrow_string (str))
523 {
524 new_buf = make_stringbuf (len);
525 memcpy (STRINGBUF_CHARS (new_buf),
526 STRINGBUF_CHARS (buf) + STRING_START (str), len);
527
528 }
529 else
530 {
531 new_buf = make_wide_stringbuf (len);
532 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
533 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
534 + STRING_START (str)), len);
535 }
3ee86942 536
3ee86942
MV
537 SET_STRING_STRINGBUF (str, new_buf);
538 start -= STRING_START (str);
902578f1
LC
539
540 /* FIXME: The following operations are not atomic, so other threads
541 looking at STR may see an inconsistent state. Nevertheless it can't
542 hurt much since (i) accessing STR while it is being mutated can't
543 yield a crash, and (ii) concurrent accesses to STR should be
544 protected by a mutex at the application level. The latter may not
545 apply when STR != ORIG_STR, though. */
3ee86942 546 SET_STRING_START (str, 0);
902578f1 547 SET_STRING_STRINGBUF (str, new_buf);
3ee86942
MV
548
549 buf = new_buf;
550
9de87eea 551 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 552 }
9c44cd45
MG
553 return orig_str;
554}
3ee86942 555
50b1996f 556/* Return a pointer to the 8-bit Latin-1 chars of a string. */
9c44cd45
MG
557char *
558scm_i_string_writable_chars (SCM str)
559{
560 SCM buf;
561 size_t start;
3ee86942 562
9c44cd45
MG
563 get_str_buf_start (&str, &buf, &start);
564 if (scm_i_is_narrow_string (str))
f59cf998 565 return (char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
566 else
567 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
568 scm_list_1 (str));
569 return NULL;
b00418df
DH
570}
571
50b1996f 572/* Return a pointer to the UCS-4 codepoints of a string. */
9c44cd45
MG
573static scm_t_wchar *
574scm_i_string_writable_wide_chars (SCM str)
575{
576 SCM buf;
577 size_t start;
578
579 get_str_buf_start (&str, &buf, &start);
580 if (!scm_i_is_narrow_string (str))
581 return STRINGBUF_WIDE_CHARS (buf) + start;
582 else
1c7b216f 583 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
9c44cd45 584 scm_list_1 (str));
b00418df
DH
585}
586
50b1996f
MG
587/* Unlock the string mutex that was locked when
588 scm_i_string_start_writing was called. */
3ee86942
MV
589void
590scm_i_string_stop_writing (void)
591{
9de87eea 592 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 593}
b00418df 594
50b1996f 595/* Return the Xth character of STR as a UCS-4 codepoint. */
9c44cd45
MG
596scm_t_wchar
597scm_i_string_ref (SCM str, size_t x)
598{
599 if (scm_i_is_narrow_string (str))
600 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
601 else
602 return scm_i_string_wide_chars (str)[x];
603}
604
889975e5
MG
605/* Returns index+1 of the first char in STR that matches C, or
606 0 if the char is not found. */
607int
608scm_i_string_contains_char (SCM str, char ch)
609{
610 size_t i;
611 size_t len = scm_i_string_length (str);
612
613 i = 0;
614 if (scm_i_is_narrow_string (str))
615 {
616 while (i < len)
617 {
618 if (scm_i_string_chars (str)[i] == ch)
619 return i+1;
620 i++;
621 }
622 }
623 else
624 {
625 while (i < len)
626 {
627 if (scm_i_string_wide_chars (str)[i]
628 == (unsigned char) ch)
629 return i+1;
630 i++;
631 }
632 }
633 return 0;
634}
635
3f47e526
MG
636int
637scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
638{
639 if (scm_i_is_narrow_string (sstr))
640 {
641 const char *a = scm_i_string_chars (sstr) + start_x;
642 const char *b = cstr;
643 return strncmp (a, b, strlen(b));
644 }
645 else
646 {
647 size_t i;
648 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
649 const char *b = cstr;
650 for (i = 0; i < strlen (b); i++)
651 {
652 if (a[i] != (unsigned char) b[i])
653 return 1;
654 }
655 }
656 return 0;
657}
658
50b1996f 659/* Set the Pth character of STR to UCS-4 codepoint CHR. */
9c44cd45
MG
660void
661scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
662{
663 if (chr > 0xFF && scm_i_is_narrow_string (str))
ba54a202 664 SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
9c44cd45
MG
665
666 if (scm_i_is_narrow_string (str))
667 {
668 char *dst = scm_i_string_writable_chars (str);
587a3355 669 dst[p] = chr;
9c44cd45
MG
670 }
671 else
672 {
673 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
674 dst[p] = chr;
675 }
676}
677
ba54a202 678\f
3ee86942 679/* Symbols.
587a3355 680
3ee86942
MV
681 Basic symbol creation and accessing is done here, the rest is in
682 symbols.[hc]. This has been done to keep stringbufs and the
683 internals of strings and string-like objects confined to this file.
684*/
685
686#define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
687
688SCM
6869328b
MV
689scm_i_make_symbol (SCM name, scm_t_bits flags,
690 unsigned long hash, SCM props)
3ee86942
MV
691{
692 SCM buf;
693 size_t start = STRING_START (name);
694 size_t length = STRING_LENGTH (name);
695
696 if (IS_SH_STRING (name))
697 {
698 name = SH_STRING_STRING (name);
699 start += STRING_START (name);
700 }
701 buf = SYMBOL_STRINGBUF (name);
702
703 if (start == 0 && length == STRINGBUF_LENGTH (buf))
704 {
705 /* reuse buf. */
9de87eea 706 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 707 SET_STRINGBUF_SHARED (buf);
9de87eea 708 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942
MV
709 }
710 else
711 {
712 /* make new buf. */
9c44cd45
MG
713 if (scm_i_is_narrow_string (name))
714 {
715 SCM new_buf = make_stringbuf (length);
716 memcpy (STRINGBUF_CHARS (new_buf),
717 STRINGBUF_CHARS (buf) + start, length);
718 buf = new_buf;
719 }
720 else
721 {
722 SCM new_buf = make_wide_stringbuf (length);
723 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
724 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
725 length);
726 buf = new_buf;
727 }
3ee86942 728 }
6869328b 729 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
3ee86942
MV
730 (scm_t_bits) hash, SCM_UNPACK (props));
731}
732
fd0a5bbc
HWN
733SCM
734scm_i_c_make_symbol (const char *name, size_t len,
735 scm_t_bits flags, unsigned long hash, SCM props)
736{
737 SCM buf = make_stringbuf (len);
738 memcpy (STRINGBUF_CHARS (buf), name, len);
739
a284cc7e
LC
740 return scm_immutable_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
741 (scm_t_bits) hash, SCM_UNPACK (props));
fd0a5bbc
HWN
742}
743
50b1996f
MG
744/* Returns the number of characters in SYM. This may be different
745 from the memory size of SYM. */
3ee86942
MV
746size_t
747scm_i_symbol_length (SCM sym)
0f2d19dd 748{
3ee86942 749 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
0f2d19dd
JB
750}
751
071bb6a8
LC
752size_t
753scm_c_symbol_length (SCM sym)
754#define FUNC_NAME "scm_c_symbol_length"
755{
756 SCM_VALIDATE_SYMBOL (1, sym);
757
758 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
759}
760#undef FUNC_NAME
761
50b1996f
MG
762/* True if the name of SYM is stored as a Latin-1 encoded string.
763 False if it is stored as a 32-bit UCS-4-encoded string. */
9c44cd45
MG
764int
765scm_i_is_narrow_symbol (SCM sym)
766{
767 SCM buf;
768
769 buf = SYMBOL_STRINGBUF (sym);
770 return !STRINGBUF_WIDE (buf);
771}
772
50b1996f
MG
773/* Returns a pointer to the 8-bit Latin-1 encoded character array that
774 contains the name of SYM. */
3ee86942
MV
775const char *
776scm_i_symbol_chars (SCM sym)
777{
9c44cd45
MG
778 SCM buf;
779
780 buf = SYMBOL_STRINGBUF (sym);
781 if (!STRINGBUF_WIDE (buf))
f59cf998 782 return (const char *) STRINGBUF_CHARS (buf);
9c44cd45
MG
783 else
784 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
785 scm_list_1 (sym));
786}
787
50b1996f
MG
788/* Return a pointer to the 32-bit UCS-4-encoded character array of a
789 symbol's name. */
9c44cd45
MG
790const scm_t_wchar *
791scm_i_symbol_wide_chars (SCM sym)
792{
793 SCM buf;
794
795 buf = SYMBOL_STRINGBUF (sym);
796 if (STRINGBUF_WIDE (buf))
f59cf998 797 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
9c44cd45
MG
798 else
799 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
800 scm_list_1 (sym));
3ee86942 801}
1cc91f1b 802
be54b15d 803SCM
3ee86942 804scm_i_symbol_substring (SCM sym, size_t start, size_t end)
be54b15d 805{
3ee86942 806 SCM buf = SYMBOL_STRINGBUF (sym);
9de87eea 807 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 808 SET_STRINGBUF_SHARED (buf);
9de87eea 809 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
fd2b17b9 810 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
3ee86942
MV
811 (scm_t_bits)start, (scm_t_bits) end - start);
812}
be54b15d 813
50b1996f 814/* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
9c44cd45
MG
815scm_t_wchar
816scm_i_symbol_ref (SCM sym, size_t x)
817{
818 if (scm_i_is_narrow_symbol (sym))
819 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
820 else
821 return scm_i_symbol_wide_chars (sym)[x];
822}
823
3ee86942
MV
824/* Debugging
825 */
be54b15d 826
6ce6923b
MG
827SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
828 "Returns an association list containing debugging information\n"
829 "for @var{str}. The association list has the following entries."
830 "@table @code\n"
831 "@item string\n"
832 "The string itself.\n"
833 "@item start\n"
834 "The start index of the string into its stringbuf\n"
835 "@item length\n"
836 "The length of the string\n"
837 "@item shared\n"
838 "If this string is a substring, it returns its parent string.\n"
839 "Otherwise, it returns @code{#f}\n"
88ed5759
MG
840 "@item read-only\n"
841 "@code{#t} if the string is read-only\n"
6ce6923b
MG
842 "@item stringbuf-chars\n"
843 "A new string containing this string's stringbuf's characters\n"
844 "@item stringbuf-length\n"
845 "The number of characters in this stringbuf\n"
846 "@item stringbuf-shared\n"
847 "@code{#t} if this stringbuf is shared\n"
6ce6923b
MG
848 "@item stringbuf-wide\n"
849 "@code{#t} if this stringbuf's characters are stored in a\n"
850 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
851 "buffer\n"
852 "@end table")
3ee86942
MV
853#define FUNC_NAME s_scm_sys_string_dump
854{
ba54a202 855 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
6ce6923b 856 SCM buf;
3ee86942 857 SCM_VALIDATE_STRING (1, str);
6ce6923b
MG
858
859 /* String info */
4a655e50 860 e1 = scm_cons (scm_from_latin1_symbol ("string"),
6ce6923b 861 str);
4a655e50 862 e2 = scm_cons (scm_from_latin1_symbol ("start"),
6ce6923b 863 scm_from_size_t (STRING_START (str)));
4a655e50 864 e3 = scm_cons (scm_from_latin1_symbol ("length"),
6ce6923b
MG
865 scm_from_size_t (STRING_LENGTH (str)));
866
3ee86942
MV
867 if (IS_SH_STRING (str))
868 {
4a655e50 869 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
6ce6923b
MG
870 SH_STRING_STRING (str));
871 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
3ee86942
MV
872 }
873 else
874 {
4a655e50 875 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
6ce6923b
MG
876 SCM_BOOL_F);
877 buf = STRING_STRINGBUF (str);
3ee86942 878 }
9c44cd45 879
88ed5759 880 if (IS_RO_STRING (str))
4a655e50 881 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
88ed5759
MG
882 SCM_BOOL_T);
883 else
4a655e50 884 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
88ed5759 885 SCM_BOOL_F);
587a3355 886
6ce6923b 887 /* Stringbuf info */
6ce6923b
MG
888 if (!STRINGBUF_WIDE (buf))
889 {
890 size_t len = STRINGBUF_LENGTH (buf);
891 char *cbuf;
892 SCM sbc = scm_i_make_string (len, &cbuf);
893 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
4a655e50 894 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b 895 sbc);
3ee86942 896 }
6ce6923b
MG
897 else
898 {
899 size_t len = STRINGBUF_LENGTH (buf);
900 scm_t_wchar *cbuf;
901 SCM sbc = scm_i_make_wide_string (len, &cbuf);
902 u32_cpy ((scm_t_uint32 *) cbuf,
903 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
4a655e50 904 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b
MG
905 sbc);
906 }
4a655e50 907 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
6ce6923b
MG
908 scm_from_size_t (STRINGBUF_LENGTH (buf)));
909 if (STRINGBUF_SHARED (buf))
4a655e50 910 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b
MG
911 SCM_BOOL_T);
912 else
4a655e50 913 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b 914 SCM_BOOL_F);
6ce6923b 915 if (STRINGBUF_WIDE (buf))
4a655e50 916 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
ba54a202 917 SCM_BOOL_T);
6ce6923b 918 else
4a655e50 919 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
ba54a202 920 SCM_BOOL_F);
6ce6923b 921
ba54a202 922 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
3ee86942
MV
923}
924#undef FUNC_NAME
925
6ce6923b
MG
926SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
927 "Returns an association list containing debugging information\n"
928 "for @var{sym}. The association list has the following entries."
929 "@table @code\n"
930 "@item symbol\n"
931 "The symbol itself\n"
932 "@item hash\n"
933 "Its hash value\n"
88ed5759
MG
934 "@item interned\n"
935 "@code{#t} if it is an interned symbol\n"
6ce6923b
MG
936 "@item stringbuf-chars\n"
937 "A new string containing this symbols's stringbuf's characters\n"
938 "@item stringbuf-length\n"
939 "The number of characters in this stringbuf\n"
940 "@item stringbuf-shared\n"
941 "@code{#t} if this stringbuf is shared\n"
6ce6923b
MG
942 "@item stringbuf-wide\n"
943 "@code{#t} if this stringbuf's characters are stored in a\n"
944 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
945 "buffer\n"
946 "@end table")
3ee86942
MV
947#define FUNC_NAME s_scm_sys_symbol_dump
948{
ba54a202 949 SCM e1, e2, e3, e4, e5, e6, e7;
6ce6923b 950 SCM buf;
3ee86942 951 SCM_VALIDATE_SYMBOL (1, sym);
4a655e50 952 e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
6ce6923b 953 sym);
4a655e50 954 e2 = scm_cons (scm_from_latin1_symbol ("hash"),
6ce6923b 955 scm_from_ulong (scm_i_symbol_hash (sym)));
4a655e50 956 e3 = scm_cons (scm_from_latin1_symbol ("interned"),
88ed5759 957 scm_symbol_interned_p (sym));
6ce6923b
MG
958 buf = SYMBOL_STRINGBUF (sym);
959
960 /* Stringbuf info */
6ce6923b
MG
961 if (!STRINGBUF_WIDE (buf))
962 {
963 size_t len = STRINGBUF_LENGTH (buf);
964 char *cbuf;
965 SCM sbc = scm_i_make_string (len, &cbuf);
966 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
4a655e50 967 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b
MG
968 sbc);
969 }
9c44cd45 970 else
6ce6923b
MG
971 {
972 size_t len = STRINGBUF_LENGTH (buf);
973 scm_t_wchar *cbuf;
974 SCM sbc = scm_i_make_wide_string (len, &cbuf);
975 u32_cpy ((scm_t_uint32 *) cbuf,
976 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
4a655e50 977 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b
MG
978 sbc);
979 }
4a655e50 980 e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
6ce6923b
MG
981 scm_from_size_t (STRINGBUF_LENGTH (buf)));
982 if (STRINGBUF_SHARED (buf))
4a655e50 983 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b
MG
984 SCM_BOOL_T);
985 else
4a655e50 986 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b 987 SCM_BOOL_F);
6ce6923b 988 if (STRINGBUF_WIDE (buf))
4a655e50 989 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
6ce6923b
MG
990 SCM_BOOL_T);
991 else
4a655e50 992 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
6ce6923b 993 SCM_BOOL_F);
ba54a202 994 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
6ce6923b 995
3ee86942
MV
996}
997#undef FUNC_NAME
998
56a3dcd4 999#ifdef SCM_STRING_LENGTH_HISTOGRAM
6ce6923b 1000
9c44cd45 1001SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
e1b29f6a 1002#define FUNC_NAME s_scm_sys_stringbuf_hist
3ee86942
MV
1003{
1004 int i;
1005 for (i = 0; i < 1000; i++)
1006 if (lenhist[i])
1007 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1008 fprintf (stderr, ">999: %u\n", lenhist[1000]);
1009 return SCM_UNSPECIFIED;
be54b15d
DH
1010}
1011#undef FUNC_NAME
1012
3ee86942
MV
1013#endif
1014
1015\f
1016
1017SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1018 (SCM obj),
1019 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1020#define FUNC_NAME s_scm_string_p
1021{
1022 return scm_from_bool (IS_STRING (obj));
1023}
1024#undef FUNC_NAME
1025
1026
1027SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1028
1029SCM_DEFINE (scm_string, "string", 0, 0, 1,
1030 (SCM chrs),
1031 "@deffnx {Scheme Procedure} list->string chrs\n"
1032 "Return a newly allocated string composed of the arguments,\n"
1033 "@var{chrs}.")
1034#define FUNC_NAME s_scm_string
1035{
9aa27c1a 1036 SCM result = SCM_BOOL_F;
9c44cd45 1037 SCM rest;
3ee86942 1038 size_t len;
9c44cd45
MG
1039 size_t p = 0;
1040 long i;
9aa27c1a 1041 int wide = 0;
3ee86942 1042
9c44cd45
MG
1043 /* Verify that this is a list of chars. */
1044 i = scm_ilength (chrs);
3c7cf7f5 1045 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
3ee86942 1046
9c44cd45
MG
1047 len = (size_t) i;
1048 rest = chrs;
3ee86942 1049
9c44cd45 1050 while (len > 0 && scm_is_pair (rest))
3ee86942 1051 {
9c44cd45 1052 SCM elt = SCM_CAR (rest);
3ee86942 1053 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
9aa27c1a
MG
1054 if (SCM_CHAR (elt) > 0xFF)
1055 wide = 1;
9c44cd45
MG
1056 rest = SCM_CDR (rest);
1057 len--;
1058 scm_remember_upto_here_1 (elt);
1059 }
1060
1061 /* Construct a string containing this list of chars. */
1062 len = (size_t) i;
1063 rest = chrs;
1064
9aa27c1a 1065 if (wide == 0)
9c44cd45 1066 {
56a3dcd4
LC
1067 char *buf;
1068
9aa27c1a
MG
1069 result = scm_i_make_string (len, NULL);
1070 result = scm_i_string_start_writing (result);
56a3dcd4 1071 buf = scm_i_string_writable_chars (result);
9aa27c1a
MG
1072 while (len > 0 && scm_is_pair (rest))
1073 {
1074 SCM elt = SCM_CAR (rest);
1075 buf[p] = (unsigned char) SCM_CHAR (elt);
1076 p++;
1077 rest = SCM_CDR (rest);
1078 len--;
1079 scm_remember_upto_here_1 (elt);
1080 }
1081 }
1082 else
1083 {
56a3dcd4
LC
1084 scm_t_wchar *buf;
1085
9aa27c1a
MG
1086 result = scm_i_make_wide_string (len, NULL);
1087 result = scm_i_string_start_writing (result);
56a3dcd4 1088 buf = scm_i_string_writable_wide_chars (result);
9aa27c1a
MG
1089 while (len > 0 && scm_is_pair (rest))
1090 {
1091 SCM elt = SCM_CAR (rest);
1092 buf[p] = SCM_CHAR (elt);
1093 p++;
1094 rest = SCM_CDR (rest);
1095 len--;
1096 scm_remember_upto_here_1 (elt);
1097 }
3ee86942 1098 }
9c44cd45
MG
1099 scm_i_string_stop_writing ();
1100
3ee86942
MV
1101 if (len > 0)
1102 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
9c44cd45 1103 if (!scm_is_null (rest))
3ee86942
MV
1104 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1105
1106 return result;
1107}
1108#undef FUNC_NAME
be54b15d 1109
3b3b36dd 1110SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
6fa73e72 1111 (SCM k, SCM chr),
0d26a824
MG
1112 "Return a newly allocated string of\n"
1113 "length @var{k}. If @var{chr} is given, then all elements of\n"
1114 "the string are initialized to @var{chr}, otherwise the contents\n"
3ef6650d 1115 "of the @var{string} are all set to @var{#\nul}.")
1bbd0b84 1116#define FUNC_NAME s_scm_make_string
0f2d19dd 1117{
3ee86942
MV
1118 return scm_c_make_string (scm_to_size_t (k), chr);
1119}
1120#undef FUNC_NAME
1121
1122SCM
1123scm_c_make_string (size_t len, SCM chr)
1124#define FUNC_NAME NULL
1125{
9c44cd45 1126 size_t p;
3ef6650d
AW
1127 char *contents = NULL;
1128 SCM res = scm_i_make_string (len, &contents);
cb0d8be2 1129
3ef6650d
AW
1130 /* If no char is given, initialize string contents to NULL. */
1131 if (SCM_UNBNDP (chr))
1132 memset (contents, 0, len);
1133 else
e11e83f3 1134 {
3ee86942 1135 SCM_VALIDATE_CHAR (0, chr);
9c44cd45
MG
1136 res = scm_i_string_start_writing (res);
1137 for (p = 0; p < len; p++)
1138 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1139 scm_i_string_stop_writing ();
0f2d19dd 1140 }
e11e83f3
MV
1141
1142 return res;
0f2d19dd 1143}
1bbd0b84 1144#undef FUNC_NAME
0f2d19dd 1145
3b3b36dd 1146SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
0d26a824
MG
1147 (SCM string),
1148 "Return the number of characters in @var{string}.")
1bbd0b84 1149#define FUNC_NAME s_scm_string_length
0f2d19dd 1150{
d1ca2c64 1151 SCM_VALIDATE_STRING (1, string);
3ee86942 1152 return scm_from_size_t (STRING_LENGTH (string));
0f2d19dd 1153}
1bbd0b84 1154#undef FUNC_NAME
0f2d19dd 1155
f8ba2bb9 1156SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
9c44cd45
MG
1157 (SCM string),
1158 "Return the bytes used to represent a character in @var{string}."
1159 "This will return 1 or 4.")
f8ba2bb9 1160#define FUNC_NAME s_scm_string_bytes_per_char
9c44cd45
MG
1161{
1162 SCM_VALIDATE_STRING (1, string);
1163 if (!scm_i_is_narrow_string (string))
1164 return scm_from_int (4);
1165
1166 return scm_from_int (1);
1167}
1168#undef FUNC_NAME
1169
3ee86942
MV
1170size_t
1171scm_c_string_length (SCM string)
1172{
1173 if (!IS_STRING (string))
1174 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1175 return STRING_LENGTH (string);
1176}
1177
bd9e24b3 1178SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
6fa73e72 1179 (SCM str, SCM k),
9c44cd45
MG
1180 "Return character @var{k} of @var{str} using zero-origin\n"
1181 "indexing. @var{k} must be a valid index of @var{str}.")
1bbd0b84 1182#define FUNC_NAME s_scm_string_ref
0f2d19dd 1183{
3ae3166b 1184 size_t len;
a55c2b68 1185 unsigned long idx;
bd9e24b3 1186
d1ca2c64 1187 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1188
1189 len = scm_i_string_length (str);
1190 if (SCM_LIKELY (len > 0))
1191 idx = scm_to_unsigned_integer (k, 0, len - 1);
1192 else
1193 scm_out_of_range (NULL, k);
1194
9c44cd45
MG
1195 if (scm_i_is_narrow_string (str))
1196 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1197 else
1198 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
0f2d19dd 1199}
1bbd0b84 1200#undef FUNC_NAME
0f2d19dd 1201
3ee86942
MV
1202SCM
1203scm_c_string_ref (SCM str, size_t p)
1204{
1205 if (p >= scm_i_string_length (str))
1206 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1207 if (scm_i_is_narrow_string (str))
1208 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1209 else
1210 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1211
3ee86942 1212}
f0942910 1213
3b3b36dd 1214SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
6fa73e72 1215 (SCM str, SCM k, SCM chr),
9c44cd45
MG
1216 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1217 "an unspecified value. @var{k} must be a valid index of\n"
1218 "@var{str}.")
1bbd0b84 1219#define FUNC_NAME s_scm_string_set_x
0f2d19dd 1220{
3ae3166b 1221 size_t len;
a55c2b68
MV
1222 unsigned long idx;
1223
f0942910 1224 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1225
1226 len = scm_i_string_length (str);
1227 if (SCM_LIKELY (len > 0))
1228 idx = scm_to_unsigned_integer (k, 0, len - 1);
1229 else
1230 scm_out_of_range (NULL, k);
1231
34d19ef6 1232 SCM_VALIDATE_CHAR (3, chr);
9c44cd45
MG
1233 str = scm_i_string_start_writing (str);
1234 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1235 scm_i_string_stop_writing ();
1236
0f2d19dd
JB
1237 return SCM_UNSPECIFIED;
1238}
1bbd0b84 1239#undef FUNC_NAME
0f2d19dd 1240
3ee86942
MV
1241void
1242scm_c_string_set_x (SCM str, size_t p, SCM chr)
1243{
1244 if (p >= scm_i_string_length (str))
1245 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1246 str = scm_i_string_start_writing (str);
1247 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1248 scm_i_string_stop_writing ();
3ee86942 1249}
0f2d19dd 1250
3b3b36dd 1251SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
0d26a824
MG
1252 (SCM str, SCM start, SCM end),
1253 "Return a newly allocated string formed from the characters\n"
1254 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1255 "ending with index @var{end} (exclusive).\n"
1256 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1257 "exact integers satisfying:\n\n"
1258 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1bbd0b84 1259#define FUNC_NAME s_scm_substring
0f2d19dd 1260{
3ee86942 1261 size_t len, from, to;
685c0d71 1262
d1ca2c64 1263 SCM_VALIDATE_STRING (1, str);
3ee86942
MV
1264 len = scm_i_string_length (str);
1265 from = scm_to_unsigned_integer (start, 0, len);
a55c2b68 1266 if (SCM_UNBNDP (end))
3ee86942 1267 to = len;
a55c2b68 1268 else
3ee86942
MV
1269 to = scm_to_unsigned_integer (end, from, len);
1270 return scm_i_substring (str, from, to);
0f2d19dd 1271}
1bbd0b84 1272#undef FUNC_NAME
0f2d19dd 1273
ed35de72
MV
1274SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1275 (SCM str, SCM start, SCM end),
1276 "Return a newly allocated string formed from the characters\n"
1277 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1278 "ending with index @var{end} (exclusive).\n"
1279 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1280 "exact integers satisfying:\n"
1281 "\n"
1282 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1283 "\n"
1284 "The returned string is read-only.\n")
1285#define FUNC_NAME s_scm_substring_read_only
1286{
1287 size_t len, from, to;
1288
1289 SCM_VALIDATE_STRING (1, str);
1290 len = scm_i_string_length (str);
1291 from = scm_to_unsigned_integer (start, 0, len);
1292 if (SCM_UNBNDP (end))
1293 to = len;
1294 else
1295 to = scm_to_unsigned_integer (end, from, len);
1296 return scm_i_substring_read_only (str, from, to);
1297}
1298#undef FUNC_NAME
1299
3ee86942
MV
1300SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1301 (SCM str, SCM start, SCM end),
1302 "Return a newly allocated string formed from the characters\n"
1303 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1304 "ending with index @var{end} (exclusive).\n"
1305 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1306 "exact integers satisfying:\n\n"
1307 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1308#define FUNC_NAME s_scm_substring_copy
1309{
e1b29f6a
MV
1310 /* For the Scheme version, START is mandatory, but for the C
1311 version, it is optional. See scm_string_copy in srfi-13.c for a
1312 rationale.
1313 */
1314
1315 size_t from, to;
3ee86942
MV
1316
1317 SCM_VALIDATE_STRING (1, str);
e1b29f6a
MV
1318 scm_i_get_substring_spec (scm_i_string_length (str),
1319 start, &from, end, &to);
3ee86942
MV
1320 return scm_i_substring_copy (str, from, to);
1321}
1322#undef FUNC_NAME
1323
1324SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1325 (SCM str, SCM start, SCM end),
1326 "Return string that indirectly refers to the characters\n"
1327 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1328 "ending with index @var{end} (exclusive).\n"
1329 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1330 "exact integers satisfying:\n\n"
1331 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1332#define FUNC_NAME s_scm_substring_shared
1333{
1334 size_t len, from, to;
1335
1336 SCM_VALIDATE_STRING (1, str);
1337 len = scm_i_string_length (str);
1338 from = scm_to_unsigned_integer (start, 0, len);
1339 if (SCM_UNBNDP (end))
1340 to = len;
1341 else
1342 to = scm_to_unsigned_integer (end, from, len);
1343 return scm_i_substring_shared (str, from, to);
1344}
1345#undef FUNC_NAME
685c0d71 1346
3b3b36dd 1347SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
6fa73e72 1348 (SCM args),
9c44cd45 1349 "Return a newly allocated string whose characters form the\n"
0d26a824 1350 "concatenation of the given strings, @var{args}.")
1bbd0b84 1351#define FUNC_NAME s_scm_string_append
0f2d19dd
JB
1352{
1353 SCM res;
9c44cd45
MG
1354 size_t len = 0;
1355 int wide = 0;
c829a427 1356 SCM l, s;
bd4911ef 1357 size_t i;
9909c395
MG
1358 union
1359 {
1360 char *narrow;
1361 scm_t_wchar *wide;
1362 } data;
af45e3b0
DH
1363
1364 SCM_VALIDATE_REST_ARGUMENT (args);
9c44cd45 1365 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427
MV
1366 {
1367 s = SCM_CAR (l);
1368 SCM_VALIDATE_STRING (SCM_ARGn, s);
9c44cd45
MG
1369 len += scm_i_string_length (s);
1370 if (!scm_i_is_narrow_string (s))
1371 wide = 1;
c829a427 1372 }
9909c395 1373 data.narrow = NULL;
9c44cd45 1374 if (!wide)
9909c395 1375 res = scm_i_make_string (len, &data.narrow);
9c44cd45 1376 else
9909c395 1377 res = scm_i_make_wide_string (len, &data.wide);
9c44cd45
MG
1378
1379 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427 1380 {
edea856c 1381 size_t len;
c829a427 1382 s = SCM_CAR (l);
3ee86942 1383 SCM_VALIDATE_STRING (SCM_ARGn, s);
edea856c 1384 len = scm_i_string_length (s);
9c44cd45
MG
1385 if (!wide)
1386 {
9909c395
MG
1387 memcpy (data.narrow, scm_i_string_chars (s), len);
1388 data.narrow += len;
9c44cd45
MG
1389 }
1390 else
1391 {
1392 if (scm_i_is_narrow_string (s))
1393 {
1394 for (i = 0; i < scm_i_string_length (s); i++)
9909c395 1395 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
9c44cd45
MG
1396 }
1397 else
9909c395 1398 u32_cpy ((scm_t_uint32 *) data.wide,
9c44cd45 1399 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
9909c395 1400 data.wide += len;
9c44cd45 1401 }
c829a427
MV
1402 scm_remember_upto_here_1 (s);
1403 }
0f2d19dd
JB
1404 return res;
1405}
1bbd0b84 1406#undef FUNC_NAME
0f2d19dd 1407
24933780 1408
a3d7d5d5
LC
1409\f
1410/* Conversion to/from other encodings. */
1411
1412SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
d14418a5 1413void
ef7e4ba3
LC
1414scm_encoding_error (const char *subr, int err, const char *message,
1415 const char *from, const char *to, SCM string_or_bv)
1416{
1417 /* Raise an exception that conveys all the information needed to debug the
1418 problem. Only perform locale conversions that are safe; in particular,
1419 don't try to display STRING_OR_BV when it's a string since converting it to
1420 the output locale may fail. */
1421 scm_throw (scm_encoding_error_key,
1422 scm_list_n (scm_from_locale_string (subr),
1423 scm_from_locale_string (message),
1424 scm_from_int (err),
1425 scm_from_locale_string (from),
1426 scm_from_locale_string (to),
1427 string_or_bv,
1428 SCM_UNDEFINED));
a3d7d5d5
LC
1429}
1430
fac32b51 1431SCM
587a3355
MG
1432scm_from_stringn (const char *str, size_t len, const char *encoding,
1433 scm_t_string_failed_conversion_handler handler)
1434{
1435 size_t u32len, i;
1436 scm_t_wchar *u32;
1437 int wide = 0;
1438 SCM res;
1439
d40e1ca8 1440 /* The order of these checks is important. */
a574564c 1441 if (!str && len != 0)
d40e1ca8
AW
1442 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1443 if (len == (size_t) -1)
1444 len = strlen (str);
a574564c
AW
1445 if (len == 0)
1446 return scm_nullstr;
fac32b51 1447
889975e5
MG
1448 if (encoding == NULL)
1449 {
1450 /* If encoding is null, use Latin-1. */
1451 char *buf;
1452 res = scm_i_make_string (len, &buf);
1453 memcpy (buf, str, len);
1454 return res;
1455 }
1456
587a3355
MG
1457 u32len = 0;
1458 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1459 (enum iconv_ilseq_handler)
1460 handler,
1461 str, len,
1462 NULL,
1463 NULL, &u32len);
1464
ef7e4ba3 1465 if (SCM_UNLIKELY (u32 == NULL))
587a3355 1466 {
ef7e4ba3
LC
1467 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1468 handler. */
1469 SCM bv;
1470 signed char *buf;
1471
1472 buf = scm_gc_malloc_pointerless (len, "bytevector");
1473 memcpy (buf, str, len);
1474 bv = scm_c_take_bytevector (buf, len);
1475
1476 scm_encoding_error (__func__, errno,
1477 "input locale conversion error",
1478 encoding, "UTF-32", bv);
587a3355
MG
1479 }
1480
1481 i = 0;
1482 while (i < u32len)
1483 if (u32[i++] > 0xFF)
1484 {
1485 wide = 1;
1486 break;
1487 }
1488
1489 if (!wide)
1490 {
1491 char *dst;
1492 res = scm_i_make_string (u32len, &dst);
1493 for (i = 0; i < u32len; i ++)
1494 dst[i] = (unsigned char) u32[i];
1495 dst[u32len] = '\0';
1496 }
1497 else
1498 {
1499 scm_t_wchar *wdst;
1500 res = scm_i_make_wide_string (u32len, &wdst);
1501 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1502 wdst[u32len] = 0;
1503 }
1504
1505 free (u32);
1506 return res;
1507}
1508
cf313a94 1509SCM
d40e1ca8 1510scm_from_locale_string (const char *str)
cf313a94 1511{
d40e1ca8 1512 return scm_from_locale_stringn (str, -1);
cf313a94
MG
1513}
1514
c829a427
MV
1515SCM
1516scm_from_locale_stringn (const char *str, size_t len)
1517{
889975e5
MG
1518 const char *enc;
1519 scm_t_string_failed_conversion_handler hndl;
1520 SCM inport;
1521 scm_t_port *pt;
4d4528e7 1522
889975e5
MG
1523 inport = scm_current_input_port ();
1524 if (!SCM_UNBNDP (inport) && SCM_OPINPORTP (inport))
1525 {
1526 pt = SCM_PTAB_ENTRY (inport);
1527 enc = pt->encoding;
1528 hndl = pt->ilseq_handler;
1529 }
1530 else
1531 {
1532 enc = NULL;
1533 hndl = SCM_FAILED_CONVERSION_ERROR;
1534 }
1535
1536 return scm_from_stringn (str, len, enc, hndl);
c829a427 1537}
4d4528e7 1538
c829a427 1539SCM
d40e1ca8 1540scm_from_latin1_string (const char *str)
4d4528e7 1541{
d40e1ca8
AW
1542 return scm_from_latin1_stringn (str, -1);
1543}
9c44cd45 1544
d40e1ca8
AW
1545SCM
1546scm_from_latin1_stringn (const char *str, size_t len)
1547{
1548 return scm_from_stringn (str, len, NULL, SCM_FAILED_CONVERSION_ERROR);
c829a427 1549}
4d4528e7 1550
587a3355 1551SCM
d40e1ca8 1552scm_from_utf8_string (const char *str)
587a3355 1553{
d40e1ca8
AW
1554 return scm_from_utf8_stringn (str, -1);
1555}
1556
1557SCM
1558scm_from_utf8_stringn (const char *str, size_t len)
1559{
1560 return scm_from_stringn (str, len, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
587a3355
MG
1561}
1562
50b1996f
MG
1563/* Create a new scheme string from the C string STR. The memory of
1564 STR may be used directly as storage for the new string. */
13a94556
LC
1565/* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1566 would be to register a finalizer to eventually free(3) STR, which isn't
1567 worth it. Should we just deprecate the `scm_take_' functions? */
c829a427
MV
1568SCM
1569scm_take_locale_stringn (char *str, size_t len)
1570{
13a94556 1571 SCM res;
48ddf0d9 1572
13a94556
LC
1573 res = scm_from_locale_stringn (str, len);
1574 free (str);
c829a427 1575
c829a427
MV
1576 return res;
1577}
1578
48ddf0d9
KR
1579SCM
1580scm_take_locale_string (char *str)
1581{
1582 return scm_take_locale_stringn (str, -1);
1583}
1584
f1ee6d54
LC
1585/* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1586 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
31d4d02b
LC
1587 Set *LENP to the size of the resulting string.
1588
1589 FIXME: This is a hack we should get rid of. See
1590 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1591 for details. */
1592static void
1593unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
9c44cd45
MG
1594{
1595 char *before, *after;
1596 size_t i, j;
1597
4ff2b9f4
LC
1598 before = buf;
1599 after = buf;
9c44cd45
MG
1600 i = 0;
1601 j = 0;
1602 while (i < *lenp)
1603 {
1604 if ((i <= *lenp - 6)
1605 && before[i] == '\\'
1606 && before[i + 1] == 'u'
1607 && before[i + 2] == '0' && before[i + 3] == '0')
1608 {
1609 /* Convert \u00NN to \xNN */
1610 after[j] = '\\';
1611 after[j + 1] = 'x';
30a6b9ca
MG
1612 after[j + 2] = tolower ((int) before[i + 4]);
1613 after[j + 3] = tolower ((int) before[i + 5]);
9c44cd45
MG
1614 i += 6;
1615 j += 4;
1616 }
1617 else if ((i <= *lenp - 10)
1618 && before[i] == '\\'
1619 && before[i + 1] == 'U'
1620 && before[i + 2] == '0' && before[i + 3] == '0')
1621 {
1622 /* Convert \U00NNNNNN to \UNNNNNN */
1623 after[j] = '\\';
1624 after[j + 1] = 'U';
30a6b9ca
MG
1625 after[j + 2] = tolower ((int) before[i + 4]);
1626 after[j + 3] = tolower ((int) before[i + 5]);
1627 after[j + 4] = tolower ((int) before[i + 6]);
1628 after[j + 5] = tolower ((int) before[i + 7]);
1629 after[j + 6] = tolower ((int) before[i + 8]);
1630 after[j + 7] = tolower ((int) before[i + 9]);
9c44cd45
MG
1631 i += 10;
1632 j += 8;
1633 }
1634 else
1635 {
1636 after[j] = before[i];
1637 i++;
1638 j++;
1639 }
1640 }
1641 *lenp = j;
9c44cd45
MG
1642}
1643
f1ee6d54
LC
1644/* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1645 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1646 of the resulting string. BUF must be large enough to handle the
1647 worst case when `\uXXXX' escapes (6 characters) are replaced by
1648 `\xXXXX;' (7 characters). */
31d4d02b
LC
1649static void
1650unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
d31b9519
MG
1651{
1652 char *before, *after;
1653 size_t i, j;
1654 /* The worst case is if the input string contains all 4-digit hex escapes.
1655 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1656 size_t max_out_len = (*lenp * 7) / 6 + 1;
1657 size_t nzeros, ndigits;
1658
4ff2b9f4 1659 before = buf;
d31b9519
MG
1660 after = alloca (max_out_len);
1661 i = 0;
1662 j = 0;
1663 while (i < *lenp)
1664 {
1665 if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1666 || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1667 {
1668 if (before[i + 1] == 'u')
1669 ndigits = 4;
1670 else if (before[i + 1] == 'U')
1671 ndigits = 8;
1672 else
1673 abort ();
1674
1675 /* Add the R6RS hex escape initial sequence. */
1676 after[j] = '\\';
1677 after[j + 1] = 'x';
1678
1679 /* Move string positions to the start of the hex numbers. */
1680 i += 2;
1681 j += 2;
1682
1683 /* Find the number of initial zeros in this hex number. */
1684 nzeros = 0;
1685 while (before[i + nzeros] == '0' && nzeros < ndigits)
1686 nzeros++;
1687
1688 /* Copy the number, skipping initial zeros, and then move the string
1689 positions. */
1690 if (nzeros == ndigits)
1691 {
1692 after[j] = '0';
1693 i += ndigits;
1694 j += 1;
1695 }
1696 else
1697 {
1698 int pos;
1699 for (pos = 0; pos < ndigits - nzeros; pos++)
1700 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1701 i += ndigits;
1702 j += (ndigits - nzeros);
1703 }
1704
1705 /* Add terminating semicolon. */
1706 after[j] = ';';
1707 j++;
1708 }
1709 else
1710 {
1711 after[j] = before[i];
1712 i++;
1713 j++;
1714 }
1715 }
1716 *lenp = j;
d31b9519
MG
1717 memcpy (before, after, j);
1718}
1719
cf313a94 1720char *
d40e1ca8 1721scm_to_locale_string (SCM str)
cf313a94 1722{
d40e1ca8 1723 return scm_to_locale_stringn (str, NULL);
cf313a94 1724}
d31b9519 1725
c829a427 1726char *
fac32b51 1727scm_to_locale_stringn (SCM str, size_t *lenp)
c829a427 1728{
889975e5
MG
1729 SCM outport;
1730 scm_t_port *pt;
9c44cd45
MG
1731 const char *enc;
1732
889975e5
MG
1733 outport = scm_current_output_port ();
1734 if (!SCM_UNBNDP (outport) && SCM_OPOUTPORTP (outport))
1735 {
1736 pt = SCM_PTAB_ENTRY (outport);
1737 enc = pt->encoding;
1738 }
1739 else
1740 enc = NULL;
9c44cd45 1741
889975e5
MG
1742 return scm_to_stringn (str, lenp,
1743 enc,
1744 scm_i_get_conversion_strategy (SCM_BOOL_F));
9c44cd45
MG
1745}
1746
d40e1ca8
AW
1747char *
1748scm_to_latin1_string (SCM str)
1749{
1750 return scm_to_latin1_stringn (str, NULL);
1751}
1752
1753char *
1754scm_to_latin1_stringn (SCM str, size_t *lenp)
1755{
1756 return scm_to_stringn (str, lenp, NULL, SCM_FAILED_CONVERSION_ERROR);
1757}
1758
1759char *
1760scm_to_utf8_string (SCM str)
1761{
1762 return scm_to_utf8_stringn (str, NULL);
1763}
1764
1765char *
1766scm_to_utf8_stringn (SCM str, size_t *lenp)
1767{
1768 return scm_to_stringn (str, lenp, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1769}
1770
29bcdbb0
LC
1771/* Return a malloc(3)-allocated buffer containing the contents of STR encoded
1772 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
1773 the returned buffer. If the conversion to ENCODING fails, apply the strategy
1774 defined by HANDLER. */
9c44cd45 1775char *
587a3355 1776scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
eca29b02 1777 scm_t_string_failed_conversion_handler handler)
9c44cd45 1778{
9c44cd45
MG
1779 char *buf;
1780 size_t ilen, len, i;
889975e5
MG
1781 int ret;
1782 const char *enc;
4d4528e7 1783
3ee86942 1784 if (!scm_is_string (str))
c829a427 1785 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1786 ilen = scm_i_string_length (str);
1787
1788 if (ilen == 0)
1789 {
1790 buf = scm_malloc (1);
1791 buf[0] = '\0';
1792 if (lenp)
1793 *lenp = 0;
1794 return buf;
1795 }
587a3355 1796
c829a427 1797 if (lenp == NULL)
9c44cd45
MG
1798 for (i = 0; i < ilen; i++)
1799 if (scm_i_string_ref (str, i) == '\0')
1800 scm_misc_error (NULL,
1801 "string contains #\\nul character: ~S",
1802 scm_list_1 (str));
1803
889975e5 1804 if (scm_i_is_narrow_string (str) && (encoding == NULL))
c829a427 1805 {
889975e5
MG
1806 /* If using native Latin-1 encoding, just copy the string
1807 contents. */
9c44cd45
MG
1808 if (lenp)
1809 {
1810 buf = scm_malloc (ilen);
1811 memcpy (buf, scm_i_string_chars (str), ilen);
1812 *lenp = ilen;
1813 return buf;
1814 }
1815 else
1816 {
1817 buf = scm_malloc (ilen + 1);
1818 memcpy (buf, scm_i_string_chars (str), ilen);
1819 buf[ilen] = '\0';
1820 return buf;
1821 }
c829a427 1822 }
9c44cd45 1823
587a3355 1824
9c44cd45
MG
1825 buf = NULL;
1826 len = 0;
889975e5
MG
1827 enc = encoding;
1828 if (enc == NULL)
1829 enc = "ISO-8859-1";
1830 if (scm_i_is_narrow_string (str))
1831 {
1832 ret = mem_iconveh (scm_i_string_chars (str), ilen,
1833 "ISO-8859-1", enc,
1834 (enum iconv_ilseq_handler) handler, NULL,
1835 &buf, &len);
9c44cd45 1836
889975e5 1837 if (ret != 0)
ef7e4ba3
LC
1838 scm_encoding_error (__func__, errno,
1839 "cannot convert to output locale",
1840 "ISO-8859-1", enc, str);
889975e5
MG
1841 }
1842 else
1843 {
d31b9519 1844 buf = u32_conv_to_encoding (enc,
889975e5 1845 (enum iconv_ilseq_handler) handler,
d31b9519 1846 (scm_t_uint32 *) scm_i_string_wide_chars (str),
889975e5
MG
1847 ilen,
1848 NULL,
1849 NULL, &len);
1850 if (buf == NULL)
ef7e4ba3
LC
1851 scm_encoding_error (__func__, errno,
1852 "cannot convert to output locale",
1853 "UTF-32", enc, str);
d31b9519
MG
1854 }
1855 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
1856 {
1857 if (SCM_R6RS_ESCAPES_P)
f1ee6d54
LC
1858 {
1859 /* The worst case is if the input string contains all 4-digit
1860 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
1861 (seven characters). Make BUF large enough to hold
1862 that. */
1863 buf = scm_realloc (buf, (len * 7) / 6 + 1);
31d4d02b 1864 unistring_escapes_to_r6rs_escapes (buf, &len);
f1ee6d54 1865 }
d31b9519 1866 else
31d4d02b 1867 unistring_escapes_to_guile_escapes (buf, &len);
4ff2b9f4
LC
1868
1869 buf = scm_realloc (buf, len);
889975e5 1870 }
9c44cd45 1871 if (lenp)
4d4528e7 1872 *lenp = len;
9c44cd45
MG
1873 else
1874 {
1875 buf = scm_realloc (buf, len + 1);
1876 buf[len] = '\0';
1877 }
24933780 1878
c829a427 1879 scm_remember_upto_here_1 (str);
9c44cd45 1880 return buf;
4d4528e7 1881}
af68e5e5 1882
c829a427
MV
1883size_t
1884scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
1885{
1886 size_t len;
9c44cd45 1887 char *result = NULL;
3ee86942 1888 if (!scm_is_string (str))
c829a427 1889 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1890 result = scm_to_locale_stringn (str, &len);
1891
1892 memcpy (buf, result, (len > max_len) ? max_len : len);
1893 free (result);
1894
c829a427
MV
1895 scm_remember_upto_here_1 (str);
1896 return len;
1897}
af68e5e5 1898
a3d7d5d5
LC
1899\f
1900/* Unicode string normalization. */
1901
edb7bb47
JG
1902/* This function is a partial clone of SCM_STRING_TO_U32_BUF from
1903 libguile/i18n.c. It would be useful to have this factored out into a more
1904 convenient location, but its use of alloca makes that tricky to do. */
1905
1906static SCM
1907normalize_str (SCM string, uninorm_t form)
1908{
1909 SCM ret;
1910 scm_t_uint32 *w_str;
1911 scm_t_wchar *cbuf;
1912 size_t rlen, len = scm_i_string_length (string);
1913
1914 if (scm_i_is_narrow_string (string))
1915 {
1916 size_t i;
1917 const char *buf = scm_i_string_chars (string);
1918
1919 w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
1920
1921 for (i = 0; i < len; i ++)
1922 w_str[i] = (unsigned char) buf[i];
1923 w_str[len] = 0;
1924 }
d8164b04
JG
1925 else
1926 w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
1927
edb7bb47
JG
1928 w_str = u32_normalize (form, w_str, len, NULL, &rlen);
1929
1930 ret = scm_i_make_wide_string (rlen, &cbuf);
1931 u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
1932 free (w_str);
d8164b04
JG
1933
1934 scm_i_try_narrow_string (ret);
1935
edb7bb47
JG
1936 return ret;
1937}
1938
1939SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
1940 (SCM string),
1941 "Returns the NFC normalized form of @var{string}.")
1942#define FUNC_NAME s_scm_string_normalize_nfc
1943{
1944 SCM_VALIDATE_STRING (1, string);
1945 return normalize_str (string, UNINORM_NFC);
1946}
1947#undef FUNC_NAME
1948
1949SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
1950 (SCM string),
1951 "Returns the NFD normalized form of @var{string}.")
1952#define FUNC_NAME s_scm_string_normalize_nfd
1953{
1954 SCM_VALIDATE_STRING (1, string);
1955 return normalize_str (string, UNINORM_NFD);
1956}
1957#undef FUNC_NAME
1958
1959SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
1960 (SCM string),
1961 "Returns the NFKC normalized form of @var{string}.")
1962#define FUNC_NAME s_scm_string_normalize_nfkc
1963{
1964 SCM_VALIDATE_STRING (1, string);
1965 return normalize_str (string, UNINORM_NFKC);
1966}
1967#undef FUNC_NAME
1968
1969SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
1970 (SCM string),
1971 "Returns the NFKD normalized form of @var{string}.")
1972#define FUNC_NAME s_scm_string_normalize_nfkd
1973{
1974 SCM_VALIDATE_STRING (1, string);
1975 return normalize_str (string, UNINORM_NFKD);
1976}
1977#undef FUNC_NAME
1978
3ee86942
MV
1979/* converts C scm_array of strings to SCM scm_list of strings. */
1980/* If argc < 0, a null terminated scm_array is assumed. */
9c44cd45 1981SCM
3ee86942
MV
1982scm_makfromstrs (int argc, char **argv)
1983{
1984 int i = argc;
1985 SCM lst = SCM_EOL;
1986 if (0 > i)
1987 for (i = 0; argv[i]; i++);
1988 while (i--)
1989 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
1990 return lst;
1991}
1992
c829a427
MV
1993/* Return a newly allocated array of char pointers to each of the strings
1994 in args, with a terminating NULL pointer. */
1995
1996char **
1997scm_i_allocate_string_pointers (SCM list)
2a776823 1998#define FUNC_NAME "scm_i_allocate_string_pointers"
af68e5e5 1999{
c829a427
MV
2000 char **result;
2001 int len = scm_ilength (list);
2002 int i;
2003
2004 if (len < 0)
2005 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2006
2a776823
LC
2007 result = scm_gc_malloc ((len + 1) * sizeof (char *),
2008 "string pointers");
c829a427 2009 result[len] = NULL;
c829a427
MV
2010
2011 /* The list might be have been modified in another thread, so
2012 we check LIST before each access.
2013 */
d2e53ed6 2014 for (i = 0; i < len && scm_is_pair (list); i++)
c829a427 2015 {
2a776823
LC
2016 SCM str;
2017 size_t len;
2018
2019 str = SCM_CAR (list);
2020 len = scm_c_string_length (str);
2021
2022 result[i] = scm_gc_malloc_pointerless (len + 1, "string pointers");
2023 memcpy (result[i], scm_i_string_chars (str), len);
2024 result[i][len] = '\0';
2025
c829a427
MV
2026 list = SCM_CDR (list);
2027 }
2028
c829a427 2029 return result;
af68e5e5 2030}
2a776823 2031#undef FUNC_NAME
24933780 2032
6f14f578
MV
2033void
2034scm_i_get_substring_spec (size_t len,
2035 SCM start, size_t *cstart,
2036 SCM end, size_t *cend)
2037{
2038 if (SCM_UNBNDP (start))
2039 *cstart = 0;
2040 else
2041 *cstart = scm_to_unsigned_integer (start, 0, len);
2042
2043 if (SCM_UNBNDP (end))
2044 *cend = len;
2045 else
2046 *cend = scm_to_unsigned_integer (end, *cstart, len);
2047}
2048
3ee86942
MV
2049#if SCM_ENABLE_DEPRECATED
2050
556d75db
MV
2051/* When these definitions are removed, it becomes reasonable to use
2052 read-only strings for string literals. For that, change the reader
2053 to create string literals with scm_c_substring_read_only instead of
2054 with scm_c_substring_copy.
2055*/
2056
3ee86942 2057int
fe78c51a 2058scm_i_deprecated_stringp (SCM str)
3ee86942
MV
2059{
2060 scm_c_issue_deprecation_warning
2061 ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
2062
2616f0e0 2063 return scm_is_string (str);
3ee86942
MV
2064}
2065
2066char *
fe78c51a 2067scm_i_deprecated_string_chars (SCM str)
3ee86942
MV
2068{
2069 char *chars;
2070
2071 scm_c_issue_deprecation_warning
2072 ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
2073
2616f0e0
MV
2074 /* We don't accept shared substrings here since they are not
2075 null-terminated.
2076 */
2077 if (IS_SH_STRING (str))
c291b588
LC
2078 scm_misc_error (NULL,
2079 "SCM_STRING_CHARS does not work with shared substrings",
2616f0e0
MV
2080 SCM_EOL);
2081
877f06c3 2082 /* We explicitly test for read-only strings to produce a better
556d75db
MV
2083 error message.
2084 */
2085
2086 if (IS_RO_STRING (str))
c291b588
LC
2087 scm_misc_error (NULL,
2088 "SCM_STRING_CHARS does not work with read-only strings",
556d75db 2089 SCM_EOL);
c291b588 2090
2616f0e0 2091 /* The following is still wrong, of course...
3ee86942 2092 */
9c44cd45 2093 str = scm_i_string_start_writing (str);
3ee86942
MV
2094 chars = scm_i_string_writable_chars (str);
2095 scm_i_string_stop_writing ();
2096 return chars;
2097}
2098
2099size_t
fe78c51a 2100scm_i_deprecated_string_length (SCM str)
3ee86942
MV
2101{
2102 scm_c_issue_deprecation_warning
2103 ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
2104 return scm_c_string_length (str);
2105}
2106
2107#endif
2108
2a610be5
AW
2109static SCM
2110string_handle_ref (scm_t_array_handle *h, size_t index)
2111{
2112 return scm_c_string_ref (h->array, index);
2113}
2114
2115static void
2116string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2117{
2118 scm_c_string_set_x (h->array, index, val);
2119}
2120
2121static void
2122string_get_handle (SCM v, scm_t_array_handle *h)
2123{
2124 h->array = v;
2125 h->ndims = 1;
2126 h->dims = &h->dim0;
2127 h->dim0.lbnd = 0;
2128 h->dim0.ubnd = scm_c_string_length (v) - 1;
2129 h->dim0.inc = 1;
2130 h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2131 h->elements = h->writable_elements = NULL;
2132}
2133
c5f17102 2134SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2a610be5 2135 string_handle_ref, string_handle_set,
f65e0168
LC
2136 string_get_handle)
2137SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2a610be5 2138
0f2d19dd
JB
2139void
2140scm_init_strings ()
0f2d19dd 2141{
3ee86942 2142 scm_nullstr = scm_i_make_string (0, NULL);
7c33806a 2143
a0599745 2144#include "libguile/strings.x"
0f2d19dd
JB
2145}
2146
89e00824
ML
2147
2148/*
2149 Local Variables:
2150 c-file-style: "gnu"
2151 End:
2152*/