read-header returns EOF at end, update (web http) docs
[bpt/guile.git] / libguile / strings.c
CommitLineData
29bcdbb0 1/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010 Free Software Foundation, Inc.
0f2d19dd 2 *
73be1d9e 3 * This library is free software; you can redistribute it and/or
53befeb7
NJ
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
0f2d19dd 7 *
53befeb7
NJ
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
0f2d19dd 12 *
73be1d9e
MV
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
53befeb7
NJ
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
73be1d9e 17 */
1bbd0b84 18
1bbd0b84 19
0f2d19dd 20\f
dbb605f5
LC
21#ifdef HAVE_CONFIG_H
22# include <config.h>
23#endif
0f2d19dd 24
cdd47ec7 25#include <alloca.h>
faf2c9d7 26#include <string.h>
3ee86942 27#include <stdio.h>
9c44cd45 28#include <ctype.h>
edb7bb47 29#include <uninorm.h>
9c44cd45 30#include <unistr.h>
eca29b02 31#include <uniconv.h>
faf2c9d7 32
889975e5
MG
33#include "striconveh.h"
34
a0599745
MD
35#include "libguile/_scm.h"
36#include "libguile/chars.h"
7c33806a 37#include "libguile/root.h"
a0599745 38#include "libguile/strings.h"
a3d7d5d5 39#include "libguile/error.h"
f45eccff 40#include "libguile/generalized-vectors.h"
1afff620 41#include "libguile/deprecation.h"
a0599745 42#include "libguile/validate.h"
d31b9519 43#include "libguile/private-options.h"
1afff620 44
0f2d19dd
JB
45\f
46
47/* {Strings}
48 */
49
3ee86942
MV
50
51/* Stringbufs
52 *
53 * XXX - keeping an accurate refcount during GC seems to be quite
54 * tricky, so we just keep score of whether a stringbuf might be
50b1996f 55 * shared, not whether it definitely is.
3ee86942
MV
56 *
57 * The scheme I (mvo) tried to keep an accurate reference count would
58 * recount all strings that point to a stringbuf during the mark-phase
59 * of the GC. This was done since one cannot access the stringbuf of
60 * a string when that string is freed (in order to decrease the
61 * reference count). The memory of the stringbuf might have been
62 * reused already for something completely different.
63 *
64 * This recounted worked for a small number of threads beating on
65 * cow-strings, but it failed randomly with more than 10 threads, say.
66 * I couldn't figure out what went wrong, so I used the conservative
67 * approach implemented below.
50b1996f 68 *
ba54a202
LC
69 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
70 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
71 * strings.
3ee86942
MV
72 */
73
ba54a202
LC
74/* The size in words of the stringbuf header (type tag + size). */
75#define STRINGBUF_HEADER_SIZE 2U
76
77#define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
78
35920c00 79#define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
5f236208 80#define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
3ee86942
MV
81
82#define STRINGBUF_TAG scm_tc7_stringbuf
83#define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
9c44cd45 84#define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
3ee86942 85
100e20c7 86#define STRINGBUF_CONTENTS(buf) ((void *) \
ba54a202
LC
87 SCM_CELL_OBJECT_LOC (buf, \
88 STRINGBUF_HEADER_SIZE))
100e20c7
LC
89#define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
90#define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
3ee86942 91
100e20c7 92#define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
3ee86942 93
9b41542f
LC
94#define SET_STRINGBUF_SHARED(buf) \
95 do \
96 { \
97 /* Don't modify BUF if it's already marked as shared since it might be \
98 a read-only, statically allocated stringbuf. */ \
99 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
100 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
101 } \
102 while (0)
3ee86942 103
56a3dcd4 104#ifdef SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
105static size_t lenhist[1001];
106#endif
107
50b1996f
MG
108/* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
109 characters. */
3ee86942
MV
110static SCM
111make_stringbuf (size_t len)
0f2d19dd 112{
3ee86942
MV
113 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
114 scm_i_symbol_chars, all stringbufs are null-terminated. Once
115 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
116 has been changed for scm_i_symbol_chars, this null-termination
117 can be dropped.
118 */
119
ba54a202
LC
120 SCM buf;
121
56a3dcd4 122#ifdef SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
123 if (len < 1000)
124 lenhist[len]++;
125 else
126 lenhist[1000]++;
127#endif
0f2d19dd 128
ba54a202
LC
129 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
130 "string"));
131
132 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
133 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
134
135 STRINGBUF_CHARS (buf)[len] = 0;
136
137 return buf;
3ee86942 138}
e53cc817 139
50b1996f
MG
140/* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
141 characters. */
9c44cd45
MG
142static SCM
143make_wide_stringbuf (size_t len)
144{
ba54a202
LC
145 SCM buf;
146 size_t raw_len;
147
56a3dcd4 148#ifdef SCM_STRING_LENGTH_HISTOGRAM
9c44cd45
MG
149 if (len < 1000)
150 lenhist[len]++;
151 else
152 lenhist[1000]++;
153#endif
154
ba54a202
LC
155 raw_len = (len + 1) * sizeof (scm_t_wchar);
156 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
157 "string"));
158
159 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
160 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
161
162 STRINGBUF_WIDE_CHARS (buf)[len] = 0;
163
164 return buf;
9c44cd45
MG
165}
166
ba54a202
LC
167/* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
168 characters from BUF. */
169static SCM
170wide_stringbuf (SCM buf)
9c44cd45 171{
ba54a202 172 SCM new_buf;
9c44cd45
MG
173
174 if (STRINGBUF_WIDE (buf))
ba54a202
LC
175 new_buf = buf;
176 else
9c44cd45 177 {
ba54a202
LC
178 size_t i, len;
179 scm_t_wchar *mem;
9c44cd45 180
ba54a202 181 len = STRINGBUF_LENGTH (buf);
9c44cd45 182
ba54a202 183 new_buf = make_wide_stringbuf (len);
9c44cd45 184
ba54a202 185 mem = STRINGBUF_WIDE_CHARS (new_buf);
9c44cd45 186 for (i = 0; i < len; i++)
ba54a202 187 mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
9c44cd45 188 mem[len] = 0;
9c44cd45 189 }
ba54a202
LC
190
191 return new_buf;
3ee86942 192}
bd9e24b3 193
ba54a202
LC
194/* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
195 characters from BUF, if possible. */
196static SCM
587a3355
MG
197narrow_stringbuf (SCM buf)
198{
ba54a202 199 SCM new_buf;
587a3355
MG
200
201 if (!STRINGBUF_WIDE (buf))
ba54a202
LC
202 new_buf = buf;
203 else
204 {
205 size_t i, len;
206 scm_t_wchar *wmem;
207 unsigned char *mem;
587a3355 208
ba54a202
LC
209 len = STRINGBUF_LENGTH (buf);
210 wmem = STRINGBUF_WIDE_CHARS (buf);
587a3355 211
ba54a202
LC
212 for (i = 0; i < len; i++)
213 if (wmem[i] > 0xFF)
214 /* BUF cannot be narrowed. */
215 return buf;
587a3355 216
ba54a202 217 new_buf = make_stringbuf (len);
587a3355 218
ba54a202
LC
219 mem = STRINGBUF_CHARS (new_buf);
220 for (i = 0; i < len; i++)
221 mem[i] = (unsigned char) wmem[i];
222 mem[len] = 0;
223 }
224
225 return new_buf;
587a3355
MG
226}
227
9de87eea 228scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
bd9e24b3 229
ba54a202 230\f
3ee86942
MV
231/* Copy-on-write strings.
232 */
bd9e24b3 233
3ee86942 234#define STRING_TAG scm_tc7_string
bd9e24b3 235
3ee86942
MV
236#define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
237#define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
238#define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
bd9e24b3 239
3ee86942
MV
240#define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
241#define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
242
243#define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
244
ed35de72
MV
245/* Read-only strings.
246 */
247
35920c00 248#define RO_STRING_TAG scm_tc7_ro_string
ed35de72
MV
249#define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
250
e1b29f6a
MV
251/* Mutation-sharing substrings
252 */
253
254#define SH_STRING_TAG (scm_tc7_string + 0x100)
255
256#define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
257/* START and LENGTH as for STRINGs. */
258
259#define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
260
e7efe8e7
AW
261SCM scm_nullstr;
262
50b1996f
MG
263/* Create a scheme string with space for LEN 8-bit Latin-1-encoded
264 characters. CHARSP, if not NULL, will be set to location of the
265 char array. */
3ee86942
MV
266SCM
267scm_i_make_string (size_t len, char **charsp)
268{
269 SCM buf = make_stringbuf (len);
270 SCM res;
271 if (charsp)
f59cf998 272 *charsp = (char *) STRINGBUF_CHARS (buf);
3ee86942
MV
273 res = scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
274 (scm_t_bits)0, (scm_t_bits) len);
275 return res;
0f2d19dd
JB
276}
277
50b1996f
MG
278/* Create a scheme string with space for LEN 32-bit UCS-4-encoded
279 characters. CHARSP, if not NULL, will be set to location of the
280 character array. */
9c44cd45 281SCM
50b1996f 282scm_i_make_wide_string (size_t len, scm_t_wchar **charsp)
9c44cd45
MG
283{
284 SCM buf = make_wide_stringbuf (len);
285 SCM res;
286 if (charsp)
287 *charsp = STRINGBUF_WIDE_CHARS (buf);
288 res = scm_double_cell (STRING_TAG, SCM_UNPACK (buf),
289 (scm_t_bits) 0, (scm_t_bits) len);
290 return res;
291}
292
3ee86942
MV
293static void
294validate_substring_args (SCM str, size_t start, size_t end)
295{
296 if (!IS_STRING (str))
297 scm_wrong_type_arg_msg (NULL, 0, str, "string");
298 if (start > STRING_LENGTH (str))
299 scm_out_of_range (NULL, scm_from_size_t (start));
300 if (end > STRING_LENGTH (str) || end < start)
301 scm_out_of_range (NULL, scm_from_size_t (end));
302}
0f2d19dd 303
e1b29f6a
MV
304static inline void
305get_str_buf_start (SCM *str, SCM *buf, size_t *start)
306{
307 *start = STRING_START (*str);
308 if (IS_SH_STRING (*str))
309 {
310 *str = SH_STRING_STRING (*str);
311 *start += STRING_START (*str);
312 }
313 *buf = STRING_STRINGBUF (*str);
314}
315
3ee86942
MV
316SCM
317scm_i_substring (SCM str, size_t start, size_t end)
0f2d19dd 318{
e1b29f6a
MV
319 SCM buf;
320 size_t str_start;
321 get_str_buf_start (&str, &buf, &str_start);
9de87eea 322 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 323 SET_STRINGBUF_SHARED (buf);
9de87eea 324 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 325 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
e1b29f6a
MV
326 (scm_t_bits)str_start + start,
327 (scm_t_bits) end - start);
0f2d19dd
JB
328}
329
ed35de72
MV
330SCM
331scm_i_substring_read_only (SCM str, size_t start, size_t end)
332{
45a9f430
LC
333 SCM buf;
334 size_t str_start;
335 get_str_buf_start (&str, &buf, &str_start);
336 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
337 SET_STRINGBUF_SHARED (buf);
338 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
339 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
340 (scm_t_bits)str_start + start,
341 (scm_t_bits) end - start);
ed35de72
MV
342}
343
3ee86942
MV
344SCM
345scm_i_substring_copy (SCM str, size_t start, size_t end)
346{
347 size_t len = end - start;
edea856c 348 SCM buf, my_buf;
e1b29f6a
MV
349 size_t str_start;
350 get_str_buf_start (&str, &buf, &str_start);
9c44cd45
MG
351 if (scm_i_is_narrow_string (str))
352 {
353 my_buf = make_stringbuf (len);
354 memcpy (STRINGBUF_CHARS (my_buf),
355 STRINGBUF_CHARS (buf) + str_start + start, len);
356 }
357 else
358 {
359 my_buf = make_wide_stringbuf (len);
360 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
361 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
362 + start), len);
363 /* Even though this string is wide, the substring may be narrow.
50b1996f 364 Consider adding code to narrow the string. */
9c44cd45 365 }
3ee86942 366 scm_remember_upto_here_1 (buf);
9c44cd45
MG
367 return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
368 (scm_t_bits) 0, (scm_t_bits) len);
3ee86942 369}
0f2d19dd 370
e1b29f6a
MV
371SCM
372scm_i_substring_shared (SCM str, size_t start, size_t end)
373{
374 if (start == 0 && end == STRING_LENGTH (str))
375 return str;
376 else
377 {
378 size_t len = end - start;
379 if (IS_SH_STRING (str))
380 {
381 start += STRING_START (str);
382 str = SH_STRING_STRING (str);
383 }
384 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
385 (scm_t_bits)start, (scm_t_bits) len);
386 }
387}
388
3ee86942
MV
389SCM
390scm_c_substring (SCM str, size_t start, size_t end)
391{
392 validate_substring_args (str, start, end);
393 return scm_i_substring (str, start, end);
394}
ee149d03 395
ed35de72
MV
396SCM
397scm_c_substring_read_only (SCM str, size_t start, size_t end)
398{
399 validate_substring_args (str, start, end);
400 return scm_i_substring_read_only (str, start, end);
401}
402
0f2d19dd 403SCM
3ee86942 404scm_c_substring_copy (SCM str, size_t start, size_t end)
0f2d19dd 405{
3ee86942
MV
406 validate_substring_args (str, start, end);
407 return scm_i_substring_copy (str, start, end);
408}
409
3ee86942
MV
410SCM
411scm_c_substring_shared (SCM str, size_t start, size_t end)
412{
413 validate_substring_args (str, start, end);
414 return scm_i_substring_shared (str, start, end);
415}
0f2d19dd 416
d6c74168 417\f
3ee86942
MV
418/* Internal accessors
419 */
420
50b1996f
MG
421/* Returns the number of characters in STR. This may be different
422 than the memory size of the string storage. */
3ee86942
MV
423size_t
424scm_i_string_length (SCM str)
0f2d19dd 425{
3ee86942 426 return STRING_LENGTH (str);
0f2d19dd
JB
427}
428
50b1996f
MG
429/* True if the string is 'narrow', meaning it has a 8-bit Latin-1
430 encoding. False if it is 'wide', having a 32-bit UCS-4
431 encoding. */
9c44cd45
MG
432int
433scm_i_is_narrow_string (SCM str)
434{
435 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
436}
437
587a3355
MG
438/* Try to coerce a string to be narrow. It if is narrow already, do
439 nothing. If it is wide, shrink it to narrow if none of its
440 characters are above 0xFF. Return true if the string is narrow or
441 was made to be narrow. */
442int
443scm_i_try_narrow_string (SCM str)
444{
ba54a202 445 SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
587a3355
MG
446
447 return scm_i_is_narrow_string (str);
448}
449
100e20c7
LC
450/* Return a pointer to the raw data of the string, which can be either Latin-1
451 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
452const void *
453scm_i_string_data (SCM str)
454{
455 SCM buf;
456 size_t start;
457 const char *data;
458
459 get_str_buf_start (&str, &buf, &start);
460
461 data = STRINGBUF_CONTENTS (buf);
462 data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
463
464 return data;
465}
466
50b1996f
MG
467/* Returns a pointer to the 8-bit Latin-1 encoded character array of
468 STR. */
3ee86942
MV
469const char *
470scm_i_string_chars (SCM str)
471{
472 SCM buf;
e1b29f6a
MV
473 size_t start;
474 get_str_buf_start (&str, &buf, &start);
9c44cd45 475 if (scm_i_is_narrow_string (str))
f59cf998 476 return (const char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
477 else
478 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
479 scm_list_1 (str));
480 return NULL;
3ee86942 481}
b00418df 482
50b1996f
MG
483/* Returns a pointer to the 32-bit UCS-4 encoded character array of
484 STR. */
9c44cd45
MG
485const scm_t_wchar *
486scm_i_string_wide_chars (SCM str)
487{
488 SCM buf;
489 size_t start;
490
491 get_str_buf_start (&str, &buf, &start);
492 if (!scm_i_is_narrow_string (str))
f59cf998 493 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
9c44cd45
MG
494 else
495 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
496 scm_list_1 (str));
497}
498
499/* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
500 a new string buffer, so that it can be modified without modifying
50b1996f
MG
501 other strings. Also, lock the string mutex. Later, one must call
502 scm_i_string_stop_writing to unlock the mutex. */
9c44cd45
MG
503SCM
504scm_i_string_start_writing (SCM orig_str)
b00418df 505{
ed35de72 506 SCM buf, str = orig_str;
e1b29f6a 507 size_t start;
ed35de72 508
e1b29f6a 509 get_str_buf_start (&str, &buf, &start);
ed35de72
MV
510 if (IS_RO_STRING (str))
511 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
512
9de87eea 513 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942
MV
514 if (STRINGBUF_SHARED (buf))
515 {
9c44cd45 516 /* Clone the stringbuf. */
3ee86942
MV
517 size_t len = STRING_LENGTH (str);
518 SCM new_buf;
519
9de87eea 520 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 521
9c44cd45
MG
522 if (scm_i_is_narrow_string (str))
523 {
524 new_buf = make_stringbuf (len);
525 memcpy (STRINGBUF_CHARS (new_buf),
526 STRINGBUF_CHARS (buf) + STRING_START (str), len);
527
528 }
529 else
530 {
531 new_buf = make_wide_stringbuf (len);
532 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
533 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
534 + STRING_START (str)), len);
535 }
3ee86942 536
3ee86942
MV
537 SET_STRING_STRINGBUF (str, new_buf);
538 start -= STRING_START (str);
902578f1
LC
539
540 /* FIXME: The following operations are not atomic, so other threads
541 looking at STR may see an inconsistent state. Nevertheless it can't
542 hurt much since (i) accessing STR while it is being mutated can't
543 yield a crash, and (ii) concurrent accesses to STR should be
544 protected by a mutex at the application level. The latter may not
545 apply when STR != ORIG_STR, though. */
3ee86942 546 SET_STRING_START (str, 0);
902578f1 547 SET_STRING_STRINGBUF (str, new_buf);
3ee86942
MV
548
549 buf = new_buf;
550
9de87eea 551 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 552 }
9c44cd45
MG
553 return orig_str;
554}
3ee86942 555
50b1996f 556/* Return a pointer to the 8-bit Latin-1 chars of a string. */
9c44cd45
MG
557char *
558scm_i_string_writable_chars (SCM str)
559{
560 SCM buf;
561 size_t start;
3ee86942 562
9c44cd45
MG
563 get_str_buf_start (&str, &buf, &start);
564 if (scm_i_is_narrow_string (str))
f59cf998 565 return (char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
566 else
567 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
568 scm_list_1 (str));
569 return NULL;
b00418df
DH
570}
571
50b1996f 572/* Return a pointer to the UCS-4 codepoints of a string. */
9c44cd45
MG
573static scm_t_wchar *
574scm_i_string_writable_wide_chars (SCM str)
575{
576 SCM buf;
577 size_t start;
578
579 get_str_buf_start (&str, &buf, &start);
580 if (!scm_i_is_narrow_string (str))
581 return STRINGBUF_WIDE_CHARS (buf) + start;
582 else
1c7b216f 583 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
9c44cd45 584 scm_list_1 (str));
b00418df
DH
585}
586
50b1996f
MG
587/* Unlock the string mutex that was locked when
588 scm_i_string_start_writing was called. */
3ee86942
MV
589void
590scm_i_string_stop_writing (void)
591{
9de87eea 592 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 593}
b00418df 594
50b1996f 595/* Return the Xth character of STR as a UCS-4 codepoint. */
9c44cd45
MG
596scm_t_wchar
597scm_i_string_ref (SCM str, size_t x)
598{
599 if (scm_i_is_narrow_string (str))
600 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
601 else
602 return scm_i_string_wide_chars (str)[x];
603}
604
889975e5
MG
605/* Returns index+1 of the first char in STR that matches C, or
606 0 if the char is not found. */
607int
608scm_i_string_contains_char (SCM str, char ch)
609{
610 size_t i;
611 size_t len = scm_i_string_length (str);
612
613 i = 0;
614 if (scm_i_is_narrow_string (str))
615 {
616 while (i < len)
617 {
618 if (scm_i_string_chars (str)[i] == ch)
619 return i+1;
620 i++;
621 }
622 }
623 else
624 {
625 while (i < len)
626 {
627 if (scm_i_string_wide_chars (str)[i]
628 == (unsigned char) ch)
629 return i+1;
630 i++;
631 }
632 }
633 return 0;
634}
635
3f47e526
MG
636int
637scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
638{
639 if (scm_i_is_narrow_string (sstr))
640 {
641 const char *a = scm_i_string_chars (sstr) + start_x;
642 const char *b = cstr;
643 return strncmp (a, b, strlen(b));
644 }
645 else
646 {
647 size_t i;
648 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
649 const char *b = cstr;
650 for (i = 0; i < strlen (b); i++)
651 {
652 if (a[i] != (unsigned char) b[i])
653 return 1;
654 }
655 }
656 return 0;
657}
658
50b1996f 659/* Set the Pth character of STR to UCS-4 codepoint CHR. */
9c44cd45
MG
660void
661scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
662{
663 if (chr > 0xFF && scm_i_is_narrow_string (str))
ba54a202 664 SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
9c44cd45
MG
665
666 if (scm_i_is_narrow_string (str))
667 {
668 char *dst = scm_i_string_writable_chars (str);
587a3355 669 dst[p] = chr;
9c44cd45
MG
670 }
671 else
672 {
673 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
674 dst[p] = chr;
675 }
676}
677
ba54a202 678\f
3ee86942 679/* Symbols.
587a3355 680
3ee86942
MV
681 Basic symbol creation and accessing is done here, the rest is in
682 symbols.[hc]. This has been done to keep stringbufs and the
683 internals of strings and string-like objects confined to this file.
684*/
685
686#define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
687
688SCM
6869328b
MV
689scm_i_make_symbol (SCM name, scm_t_bits flags,
690 unsigned long hash, SCM props)
3ee86942
MV
691{
692 SCM buf;
693 size_t start = STRING_START (name);
694 size_t length = STRING_LENGTH (name);
695
696 if (IS_SH_STRING (name))
697 {
698 name = SH_STRING_STRING (name);
699 start += STRING_START (name);
700 }
701 buf = SYMBOL_STRINGBUF (name);
702
703 if (start == 0 && length == STRINGBUF_LENGTH (buf))
704 {
705 /* reuse buf. */
9de87eea 706 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 707 SET_STRINGBUF_SHARED (buf);
9de87eea 708 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942
MV
709 }
710 else
711 {
712 /* make new buf. */
9c44cd45
MG
713 if (scm_i_is_narrow_string (name))
714 {
715 SCM new_buf = make_stringbuf (length);
716 memcpy (STRINGBUF_CHARS (new_buf),
717 STRINGBUF_CHARS (buf) + start, length);
718 buf = new_buf;
719 }
720 else
721 {
722 SCM new_buf = make_wide_stringbuf (length);
723 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
724 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
725 length);
726 buf = new_buf;
727 }
3ee86942 728 }
6869328b 729 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
3ee86942
MV
730 (scm_t_bits) hash, SCM_UNPACK (props));
731}
732
fd0a5bbc
HWN
733SCM
734scm_i_c_make_symbol (const char *name, size_t len,
735 scm_t_bits flags, unsigned long hash, SCM props)
736{
737 SCM buf = make_stringbuf (len);
738 memcpy (STRINGBUF_CHARS (buf), name, len);
739
a284cc7e
LC
740 return scm_immutable_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
741 (scm_t_bits) hash, SCM_UNPACK (props));
fd0a5bbc
HWN
742}
743
50b1996f
MG
744/* Returns the number of characters in SYM. This may be different
745 from the memory size of SYM. */
3ee86942
MV
746size_t
747scm_i_symbol_length (SCM sym)
0f2d19dd 748{
3ee86942 749 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
0f2d19dd
JB
750}
751
071bb6a8
LC
752size_t
753scm_c_symbol_length (SCM sym)
754#define FUNC_NAME "scm_c_symbol_length"
755{
756 SCM_VALIDATE_SYMBOL (1, sym);
757
758 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
759}
760#undef FUNC_NAME
761
50b1996f
MG
762/* True if the name of SYM is stored as a Latin-1 encoded string.
763 False if it is stored as a 32-bit UCS-4-encoded string. */
9c44cd45
MG
764int
765scm_i_is_narrow_symbol (SCM sym)
766{
767 SCM buf;
768
769 buf = SYMBOL_STRINGBUF (sym);
770 return !STRINGBUF_WIDE (buf);
771}
772
50b1996f
MG
773/* Returns a pointer to the 8-bit Latin-1 encoded character array that
774 contains the name of SYM. */
3ee86942
MV
775const char *
776scm_i_symbol_chars (SCM sym)
777{
9c44cd45
MG
778 SCM buf;
779
780 buf = SYMBOL_STRINGBUF (sym);
781 if (!STRINGBUF_WIDE (buf))
f59cf998 782 return (const char *) STRINGBUF_CHARS (buf);
9c44cd45
MG
783 else
784 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
785 scm_list_1 (sym));
786}
787
50b1996f
MG
788/* Return a pointer to the 32-bit UCS-4-encoded character array of a
789 symbol's name. */
9c44cd45
MG
790const scm_t_wchar *
791scm_i_symbol_wide_chars (SCM sym)
792{
793 SCM buf;
794
795 buf = SYMBOL_STRINGBUF (sym);
796 if (STRINGBUF_WIDE (buf))
f59cf998 797 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
9c44cd45
MG
798 else
799 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
800 scm_list_1 (sym));
3ee86942 801}
1cc91f1b 802
be54b15d 803SCM
3ee86942 804scm_i_symbol_substring (SCM sym, size_t start, size_t end)
be54b15d 805{
3ee86942 806 SCM buf = SYMBOL_STRINGBUF (sym);
9de87eea 807 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 808 SET_STRINGBUF_SHARED (buf);
9de87eea 809 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
fd2b17b9 810 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
3ee86942
MV
811 (scm_t_bits)start, (scm_t_bits) end - start);
812}
be54b15d 813
50b1996f 814/* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
9c44cd45
MG
815scm_t_wchar
816scm_i_symbol_ref (SCM sym, size_t x)
817{
818 if (scm_i_is_narrow_symbol (sym))
819 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
820 else
821 return scm_i_symbol_wide_chars (sym)[x];
822}
823
3ee86942
MV
824/* Debugging
825 */
be54b15d 826
6ce6923b
MG
827SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
828 "Returns an association list containing debugging information\n"
829 "for @var{str}. The association list has the following entries."
830 "@table @code\n"
831 "@item string\n"
832 "The string itself.\n"
833 "@item start\n"
834 "The start index of the string into its stringbuf\n"
835 "@item length\n"
836 "The length of the string\n"
837 "@item shared\n"
838 "If this string is a substring, it returns its parent string.\n"
839 "Otherwise, it returns @code{#f}\n"
88ed5759
MG
840 "@item read-only\n"
841 "@code{#t} if the string is read-only\n"
6ce6923b
MG
842 "@item stringbuf-chars\n"
843 "A new string containing this string's stringbuf's characters\n"
844 "@item stringbuf-length\n"
845 "The number of characters in this stringbuf\n"
846 "@item stringbuf-shared\n"
847 "@code{#t} if this stringbuf is shared\n"
6ce6923b
MG
848 "@item stringbuf-wide\n"
849 "@code{#t} if this stringbuf's characters are stored in a\n"
850 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
851 "buffer\n"
852 "@end table")
3ee86942
MV
853#define FUNC_NAME s_scm_sys_string_dump
854{
ba54a202 855 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
6ce6923b 856 SCM buf;
3ee86942 857 SCM_VALIDATE_STRING (1, str);
6ce6923b
MG
858
859 /* String info */
860 e1 = scm_cons (scm_from_locale_symbol ("string"),
861 str);
862 e2 = scm_cons (scm_from_locale_symbol ("start"),
863 scm_from_size_t (STRING_START (str)));
864 e3 = scm_cons (scm_from_locale_symbol ("length"),
865 scm_from_size_t (STRING_LENGTH (str)));
866
3ee86942
MV
867 if (IS_SH_STRING (str))
868 {
6ce6923b
MG
869 e4 = scm_cons (scm_from_locale_symbol ("shared"),
870 SH_STRING_STRING (str));
871 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
3ee86942
MV
872 }
873 else
874 {
6ce6923b
MG
875 e4 = scm_cons (scm_from_locale_symbol ("shared"),
876 SCM_BOOL_F);
877 buf = STRING_STRINGBUF (str);
3ee86942 878 }
9c44cd45 879
88ed5759
MG
880 if (IS_RO_STRING (str))
881 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
882 SCM_BOOL_T);
883 else
884 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
885 SCM_BOOL_F);
587a3355 886
6ce6923b 887 /* Stringbuf info */
6ce6923b
MG
888 if (!STRINGBUF_WIDE (buf))
889 {
890 size_t len = STRINGBUF_LENGTH (buf);
891 char *cbuf;
892 SCM sbc = scm_i_make_string (len, &cbuf);
893 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
894 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
895 sbc);
3ee86942 896 }
6ce6923b
MG
897 else
898 {
899 size_t len = STRINGBUF_LENGTH (buf);
900 scm_t_wchar *cbuf;
901 SCM sbc = scm_i_make_wide_string (len, &cbuf);
902 u32_cpy ((scm_t_uint32 *) cbuf,
903 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
904 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
905 sbc);
906 }
907 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
908 scm_from_size_t (STRINGBUF_LENGTH (buf)));
909 if (STRINGBUF_SHARED (buf))
910 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
911 SCM_BOOL_T);
912 else
913 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
914 SCM_BOOL_F);
6ce6923b 915 if (STRINGBUF_WIDE (buf))
ba54a202
LC
916 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
917 SCM_BOOL_T);
6ce6923b 918 else
ba54a202
LC
919 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
920 SCM_BOOL_F);
6ce6923b 921
ba54a202 922 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
3ee86942
MV
923}
924#undef FUNC_NAME
925
6ce6923b
MG
926SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
927 "Returns an association list containing debugging information\n"
928 "for @var{sym}. The association list has the following entries."
929 "@table @code\n"
930 "@item symbol\n"
931 "The symbol itself\n"
932 "@item hash\n"
933 "Its hash value\n"
88ed5759
MG
934 "@item interned\n"
935 "@code{#t} if it is an interned symbol\n"
6ce6923b
MG
936 "@item stringbuf-chars\n"
937 "A new string containing this symbols's stringbuf's characters\n"
938 "@item stringbuf-length\n"
939 "The number of characters in this stringbuf\n"
940 "@item stringbuf-shared\n"
941 "@code{#t} if this stringbuf is shared\n"
6ce6923b
MG
942 "@item stringbuf-wide\n"
943 "@code{#t} if this stringbuf's characters are stored in a\n"
944 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
945 "buffer\n"
946 "@end table")
3ee86942
MV
947#define FUNC_NAME s_scm_sys_symbol_dump
948{
ba54a202 949 SCM e1, e2, e3, e4, e5, e6, e7;
6ce6923b 950 SCM buf;
3ee86942 951 SCM_VALIDATE_SYMBOL (1, sym);
6ce6923b
MG
952 e1 = scm_cons (scm_from_locale_symbol ("symbol"),
953 sym);
954 e2 = scm_cons (scm_from_locale_symbol ("hash"),
955 scm_from_ulong (scm_i_symbol_hash (sym)));
88ed5759
MG
956 e3 = scm_cons (scm_from_locale_symbol ("interned"),
957 scm_symbol_interned_p (sym));
6ce6923b
MG
958 buf = SYMBOL_STRINGBUF (sym);
959
960 /* Stringbuf info */
6ce6923b
MG
961 if (!STRINGBUF_WIDE (buf))
962 {
963 size_t len = STRINGBUF_LENGTH (buf);
964 char *cbuf;
965 SCM sbc = scm_i_make_string (len, &cbuf);
966 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
967 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
968 sbc);
969 }
9c44cd45 970 else
6ce6923b
MG
971 {
972 size_t len = STRINGBUF_LENGTH (buf);
973 scm_t_wchar *cbuf;
974 SCM sbc = scm_i_make_wide_string (len, &cbuf);
975 u32_cpy ((scm_t_uint32 *) cbuf,
976 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
977 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
978 sbc);
979 }
980 e5 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
981 scm_from_size_t (STRINGBUF_LENGTH (buf)));
982 if (STRINGBUF_SHARED (buf))
983 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
984 SCM_BOOL_T);
985 else
986 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
987 SCM_BOOL_F);
6ce6923b 988 if (STRINGBUF_WIDE (buf))
ba54a202 989 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
6ce6923b
MG
990 SCM_BOOL_T);
991 else
ba54a202 992 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
6ce6923b 993 SCM_BOOL_F);
ba54a202 994 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
6ce6923b 995
3ee86942
MV
996}
997#undef FUNC_NAME
998
56a3dcd4 999#ifdef SCM_STRING_LENGTH_HISTOGRAM
6ce6923b 1000
9c44cd45 1001SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
e1b29f6a 1002#define FUNC_NAME s_scm_sys_stringbuf_hist
3ee86942
MV
1003{
1004 int i;
1005 for (i = 0; i < 1000; i++)
1006 if (lenhist[i])
1007 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1008 fprintf (stderr, ">999: %u\n", lenhist[1000]);
1009 return SCM_UNSPECIFIED;
be54b15d
DH
1010}
1011#undef FUNC_NAME
1012
3ee86942
MV
1013#endif
1014
1015\f
1016
1017SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1018 (SCM obj),
1019 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1020#define FUNC_NAME s_scm_string_p
1021{
1022 return scm_from_bool (IS_STRING (obj));
1023}
1024#undef FUNC_NAME
1025
1026
1027SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1028
1029SCM_DEFINE (scm_string, "string", 0, 0, 1,
1030 (SCM chrs),
1031 "@deffnx {Scheme Procedure} list->string chrs\n"
1032 "Return a newly allocated string composed of the arguments,\n"
1033 "@var{chrs}.")
1034#define FUNC_NAME s_scm_string
1035{
9aa27c1a 1036 SCM result = SCM_BOOL_F;
9c44cd45 1037 SCM rest;
3ee86942 1038 size_t len;
9c44cd45
MG
1039 size_t p = 0;
1040 long i;
9aa27c1a 1041 int wide = 0;
3ee86942 1042
9c44cd45
MG
1043 /* Verify that this is a list of chars. */
1044 i = scm_ilength (chrs);
3c7cf7f5 1045 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
3ee86942 1046
9c44cd45
MG
1047 len = (size_t) i;
1048 rest = chrs;
3ee86942 1049
9c44cd45 1050 while (len > 0 && scm_is_pair (rest))
3ee86942 1051 {
9c44cd45 1052 SCM elt = SCM_CAR (rest);
3ee86942 1053 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
9aa27c1a
MG
1054 if (SCM_CHAR (elt) > 0xFF)
1055 wide = 1;
9c44cd45
MG
1056 rest = SCM_CDR (rest);
1057 len--;
1058 scm_remember_upto_here_1 (elt);
1059 }
1060
1061 /* Construct a string containing this list of chars. */
1062 len = (size_t) i;
1063 rest = chrs;
1064
9aa27c1a 1065 if (wide == 0)
9c44cd45 1066 {
56a3dcd4
LC
1067 char *buf;
1068
9aa27c1a
MG
1069 result = scm_i_make_string (len, NULL);
1070 result = scm_i_string_start_writing (result);
56a3dcd4 1071 buf = scm_i_string_writable_chars (result);
9aa27c1a
MG
1072 while (len > 0 && scm_is_pair (rest))
1073 {
1074 SCM elt = SCM_CAR (rest);
1075 buf[p] = (unsigned char) SCM_CHAR (elt);
1076 p++;
1077 rest = SCM_CDR (rest);
1078 len--;
1079 scm_remember_upto_here_1 (elt);
1080 }
1081 }
1082 else
1083 {
56a3dcd4
LC
1084 scm_t_wchar *buf;
1085
9aa27c1a
MG
1086 result = scm_i_make_wide_string (len, NULL);
1087 result = scm_i_string_start_writing (result);
56a3dcd4 1088 buf = scm_i_string_writable_wide_chars (result);
9aa27c1a
MG
1089 while (len > 0 && scm_is_pair (rest))
1090 {
1091 SCM elt = SCM_CAR (rest);
1092 buf[p] = SCM_CHAR (elt);
1093 p++;
1094 rest = SCM_CDR (rest);
1095 len--;
1096 scm_remember_upto_here_1 (elt);
1097 }
3ee86942 1098 }
9c44cd45
MG
1099 scm_i_string_stop_writing ();
1100
3ee86942
MV
1101 if (len > 0)
1102 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
9c44cd45 1103 if (!scm_is_null (rest))
3ee86942
MV
1104 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1105
1106 return result;
1107}
1108#undef FUNC_NAME
be54b15d 1109
3b3b36dd 1110SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
6fa73e72 1111 (SCM k, SCM chr),
0d26a824
MG
1112 "Return a newly allocated string of\n"
1113 "length @var{k}. If @var{chr} is given, then all elements of\n"
1114 "the string are initialized to @var{chr}, otherwise the contents\n"
3ef6650d 1115 "of the @var{string} are all set to @var{#\nul}.")
1bbd0b84 1116#define FUNC_NAME s_scm_make_string
0f2d19dd 1117{
3ee86942
MV
1118 return scm_c_make_string (scm_to_size_t (k), chr);
1119}
1120#undef FUNC_NAME
1121
1122SCM
1123scm_c_make_string (size_t len, SCM chr)
1124#define FUNC_NAME NULL
1125{
9c44cd45 1126 size_t p;
3ef6650d
AW
1127 char *contents = NULL;
1128 SCM res = scm_i_make_string (len, &contents);
cb0d8be2 1129
3ef6650d
AW
1130 /* If no char is given, initialize string contents to NULL. */
1131 if (SCM_UNBNDP (chr))
1132 memset (contents, 0, len);
1133 else
e11e83f3 1134 {
3ee86942 1135 SCM_VALIDATE_CHAR (0, chr);
9c44cd45
MG
1136 res = scm_i_string_start_writing (res);
1137 for (p = 0; p < len; p++)
1138 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1139 scm_i_string_stop_writing ();
0f2d19dd 1140 }
e11e83f3
MV
1141
1142 return res;
0f2d19dd 1143}
1bbd0b84 1144#undef FUNC_NAME
0f2d19dd 1145
3b3b36dd 1146SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
0d26a824
MG
1147 (SCM string),
1148 "Return the number of characters in @var{string}.")
1bbd0b84 1149#define FUNC_NAME s_scm_string_length
0f2d19dd 1150{
d1ca2c64 1151 SCM_VALIDATE_STRING (1, string);
3ee86942 1152 return scm_from_size_t (STRING_LENGTH (string));
0f2d19dd 1153}
1bbd0b84 1154#undef FUNC_NAME
0f2d19dd 1155
f8ba2bb9 1156SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
9c44cd45
MG
1157 (SCM string),
1158 "Return the bytes used to represent a character in @var{string}."
1159 "This will return 1 or 4.")
f8ba2bb9 1160#define FUNC_NAME s_scm_string_bytes_per_char
9c44cd45
MG
1161{
1162 SCM_VALIDATE_STRING (1, string);
1163 if (!scm_i_is_narrow_string (string))
1164 return scm_from_int (4);
1165
1166 return scm_from_int (1);
1167}
1168#undef FUNC_NAME
1169
3ee86942
MV
1170size_t
1171scm_c_string_length (SCM string)
1172{
1173 if (!IS_STRING (string))
1174 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1175 return STRING_LENGTH (string);
1176}
1177
bd9e24b3 1178SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
6fa73e72 1179 (SCM str, SCM k),
9c44cd45
MG
1180 "Return character @var{k} of @var{str} using zero-origin\n"
1181 "indexing. @var{k} must be a valid index of @var{str}.")
1bbd0b84 1182#define FUNC_NAME s_scm_string_ref
0f2d19dd 1183{
3ae3166b 1184 size_t len;
a55c2b68 1185 unsigned long idx;
bd9e24b3 1186
d1ca2c64 1187 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1188
1189 len = scm_i_string_length (str);
1190 if (SCM_LIKELY (len > 0))
1191 idx = scm_to_unsigned_integer (k, 0, len - 1);
1192 else
1193 scm_out_of_range (NULL, k);
1194
9c44cd45
MG
1195 if (scm_i_is_narrow_string (str))
1196 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1197 else
1198 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
0f2d19dd 1199}
1bbd0b84 1200#undef FUNC_NAME
0f2d19dd 1201
3ee86942
MV
1202SCM
1203scm_c_string_ref (SCM str, size_t p)
1204{
1205 if (p >= scm_i_string_length (str))
1206 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1207 if (scm_i_is_narrow_string (str))
1208 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1209 else
1210 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1211
3ee86942 1212}
f0942910 1213
3b3b36dd 1214SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
6fa73e72 1215 (SCM str, SCM k, SCM chr),
9c44cd45
MG
1216 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1217 "an unspecified value. @var{k} must be a valid index of\n"
1218 "@var{str}.")
1bbd0b84 1219#define FUNC_NAME s_scm_string_set_x
0f2d19dd 1220{
3ae3166b 1221 size_t len;
a55c2b68
MV
1222 unsigned long idx;
1223
f0942910 1224 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1225
1226 len = scm_i_string_length (str);
1227 if (SCM_LIKELY (len > 0))
1228 idx = scm_to_unsigned_integer (k, 0, len - 1);
1229 else
1230 scm_out_of_range (NULL, k);
1231
34d19ef6 1232 SCM_VALIDATE_CHAR (3, chr);
9c44cd45
MG
1233 str = scm_i_string_start_writing (str);
1234 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1235 scm_i_string_stop_writing ();
1236
0f2d19dd
JB
1237 return SCM_UNSPECIFIED;
1238}
1bbd0b84 1239#undef FUNC_NAME
0f2d19dd 1240
3ee86942
MV
1241void
1242scm_c_string_set_x (SCM str, size_t p, SCM chr)
1243{
1244 if (p >= scm_i_string_length (str))
1245 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1246 str = scm_i_string_start_writing (str);
1247 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1248 scm_i_string_stop_writing ();
3ee86942 1249}
0f2d19dd 1250
3b3b36dd 1251SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
0d26a824
MG
1252 (SCM str, SCM start, SCM end),
1253 "Return a newly allocated string formed from the characters\n"
1254 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1255 "ending with index @var{end} (exclusive).\n"
1256 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1257 "exact integers satisfying:\n\n"
1258 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1bbd0b84 1259#define FUNC_NAME s_scm_substring
0f2d19dd 1260{
3ee86942 1261 size_t len, from, to;
685c0d71 1262
d1ca2c64 1263 SCM_VALIDATE_STRING (1, str);
3ee86942
MV
1264 len = scm_i_string_length (str);
1265 from = scm_to_unsigned_integer (start, 0, len);
a55c2b68 1266 if (SCM_UNBNDP (end))
3ee86942 1267 to = len;
a55c2b68 1268 else
3ee86942
MV
1269 to = scm_to_unsigned_integer (end, from, len);
1270 return scm_i_substring (str, from, to);
0f2d19dd 1271}
1bbd0b84 1272#undef FUNC_NAME
0f2d19dd 1273
ed35de72
MV
1274SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1275 (SCM str, SCM start, SCM end),
1276 "Return a newly allocated string formed from the characters\n"
1277 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1278 "ending with index @var{end} (exclusive).\n"
1279 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1280 "exact integers satisfying:\n"
1281 "\n"
1282 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1283 "\n"
1284 "The returned string is read-only.\n")
1285#define FUNC_NAME s_scm_substring_read_only
1286{
1287 size_t len, from, to;
1288
1289 SCM_VALIDATE_STRING (1, str);
1290 len = scm_i_string_length (str);
1291 from = scm_to_unsigned_integer (start, 0, len);
1292 if (SCM_UNBNDP (end))
1293 to = len;
1294 else
1295 to = scm_to_unsigned_integer (end, from, len);
1296 return scm_i_substring_read_only (str, from, to);
1297}
1298#undef FUNC_NAME
1299
3ee86942
MV
1300SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1301 (SCM str, SCM start, SCM end),
1302 "Return a newly allocated string formed from the characters\n"
1303 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1304 "ending with index @var{end} (exclusive).\n"
1305 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1306 "exact integers satisfying:\n\n"
1307 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1308#define FUNC_NAME s_scm_substring_copy
1309{
e1b29f6a
MV
1310 /* For the Scheme version, START is mandatory, but for the C
1311 version, it is optional. See scm_string_copy in srfi-13.c for a
1312 rationale.
1313 */
1314
1315 size_t from, to;
3ee86942
MV
1316
1317 SCM_VALIDATE_STRING (1, str);
e1b29f6a
MV
1318 scm_i_get_substring_spec (scm_i_string_length (str),
1319 start, &from, end, &to);
3ee86942
MV
1320 return scm_i_substring_copy (str, from, to);
1321}
1322#undef FUNC_NAME
1323
1324SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1325 (SCM str, SCM start, SCM end),
1326 "Return string that indirectly refers to the characters\n"
1327 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1328 "ending with index @var{end} (exclusive).\n"
1329 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1330 "exact integers satisfying:\n\n"
1331 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1332#define FUNC_NAME s_scm_substring_shared
1333{
1334 size_t len, from, to;
1335
1336 SCM_VALIDATE_STRING (1, str);
1337 len = scm_i_string_length (str);
1338 from = scm_to_unsigned_integer (start, 0, len);
1339 if (SCM_UNBNDP (end))
1340 to = len;
1341 else
1342 to = scm_to_unsigned_integer (end, from, len);
1343 return scm_i_substring_shared (str, from, to);
1344}
1345#undef FUNC_NAME
685c0d71 1346
3b3b36dd 1347SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
6fa73e72 1348 (SCM args),
9c44cd45 1349 "Return a newly allocated string whose characters form the\n"
0d26a824 1350 "concatenation of the given strings, @var{args}.")
1bbd0b84 1351#define FUNC_NAME s_scm_string_append
0f2d19dd
JB
1352{
1353 SCM res;
9c44cd45
MG
1354 size_t len = 0;
1355 int wide = 0;
c829a427 1356 SCM l, s;
bd4911ef 1357 size_t i;
9909c395
MG
1358 union
1359 {
1360 char *narrow;
1361 scm_t_wchar *wide;
1362 } data;
af45e3b0
DH
1363
1364 SCM_VALIDATE_REST_ARGUMENT (args);
9c44cd45 1365 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427
MV
1366 {
1367 s = SCM_CAR (l);
1368 SCM_VALIDATE_STRING (SCM_ARGn, s);
9c44cd45
MG
1369 len += scm_i_string_length (s);
1370 if (!scm_i_is_narrow_string (s))
1371 wide = 1;
c829a427 1372 }
9909c395 1373 data.narrow = NULL;
9c44cd45 1374 if (!wide)
9909c395 1375 res = scm_i_make_string (len, &data.narrow);
9c44cd45 1376 else
9909c395 1377 res = scm_i_make_wide_string (len, &data.wide);
9c44cd45
MG
1378
1379 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427 1380 {
edea856c 1381 size_t len;
c829a427 1382 s = SCM_CAR (l);
3ee86942 1383 SCM_VALIDATE_STRING (SCM_ARGn, s);
edea856c 1384 len = scm_i_string_length (s);
9c44cd45
MG
1385 if (!wide)
1386 {
9909c395
MG
1387 memcpy (data.narrow, scm_i_string_chars (s), len);
1388 data.narrow += len;
9c44cd45
MG
1389 }
1390 else
1391 {
1392 if (scm_i_is_narrow_string (s))
1393 {
1394 for (i = 0; i < scm_i_string_length (s); i++)
9909c395 1395 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
9c44cd45
MG
1396 }
1397 else
9909c395 1398 u32_cpy ((scm_t_uint32 *) data.wide,
9c44cd45 1399 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
9909c395 1400 data.wide += len;
9c44cd45 1401 }
c829a427
MV
1402 scm_remember_upto_here_1 (s);
1403 }
0f2d19dd
JB
1404 return res;
1405}
1bbd0b84 1406#undef FUNC_NAME
0f2d19dd 1407
24933780 1408
a3d7d5d5
LC
1409\f
1410/* Conversion to/from other encodings. */
1411
1412SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
d14418a5 1413void
ef7e4ba3
LC
1414scm_encoding_error (const char *subr, int err, const char *message,
1415 const char *from, const char *to, SCM string_or_bv)
1416{
1417 /* Raise an exception that conveys all the information needed to debug the
1418 problem. Only perform locale conversions that are safe; in particular,
1419 don't try to display STRING_OR_BV when it's a string since converting it to
1420 the output locale may fail. */
1421 scm_throw (scm_encoding_error_key,
1422 scm_list_n (scm_from_locale_string (subr),
1423 scm_from_locale_string (message),
1424 scm_from_int (err),
1425 scm_from_locale_string (from),
1426 scm_from_locale_string (to),
1427 string_or_bv,
1428 SCM_UNDEFINED));
a3d7d5d5
LC
1429}
1430
fac32b51 1431SCM
587a3355
MG
1432scm_from_stringn (const char *str, size_t len, const char *encoding,
1433 scm_t_string_failed_conversion_handler handler)
1434{
1435 size_t u32len, i;
1436 scm_t_wchar *u32;
1437 int wide = 0;
1438 SCM res;
1439
fac32b51
MG
1440 if (len == 0)
1441 return scm_nullstr;
1442
889975e5
MG
1443 if (encoding == NULL)
1444 {
1445 /* If encoding is null, use Latin-1. */
1446 char *buf;
1447 res = scm_i_make_string (len, &buf);
1448 memcpy (buf, str, len);
1449 return res;
1450 }
1451
587a3355
MG
1452 u32len = 0;
1453 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1454 (enum iconv_ilseq_handler)
1455 handler,
1456 str, len,
1457 NULL,
1458 NULL, &u32len);
1459
ef7e4ba3 1460 if (SCM_UNLIKELY (u32 == NULL))
587a3355 1461 {
ef7e4ba3
LC
1462 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1463 handler. */
1464 SCM bv;
1465 signed char *buf;
1466
1467 buf = scm_gc_malloc_pointerless (len, "bytevector");
1468 memcpy (buf, str, len);
1469 bv = scm_c_take_bytevector (buf, len);
1470
1471 scm_encoding_error (__func__, errno,
1472 "input locale conversion error",
1473 encoding, "UTF-32", bv);
587a3355
MG
1474 }
1475
1476 i = 0;
1477 while (i < u32len)
1478 if (u32[i++] > 0xFF)
1479 {
1480 wide = 1;
1481 break;
1482 }
1483
1484 if (!wide)
1485 {
1486 char *dst;
1487 res = scm_i_make_string (u32len, &dst);
1488 for (i = 0; i < u32len; i ++)
1489 dst[i] = (unsigned char) u32[i];
1490 dst[u32len] = '\0';
1491 }
1492 else
1493 {
1494 scm_t_wchar *wdst;
1495 res = scm_i_make_wide_string (u32len, &wdst);
1496 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1497 wdst[u32len] = 0;
1498 }
1499
1500 free (u32);
1501 return res;
1502}
1503
cf313a94
MG
1504SCM
1505scm_from_latin1_stringn (const char *str, size_t len)
1506{
1507 return scm_from_stringn (str, len, NULL, SCM_FAILED_CONVERSION_ERROR);
1508}
1509
c829a427
MV
1510SCM
1511scm_from_locale_stringn (const char *str, size_t len)
1512{
889975e5
MG
1513 const char *enc;
1514 scm_t_string_failed_conversion_handler hndl;
1515 SCM inport;
1516 scm_t_port *pt;
4d4528e7 1517
9c44cd45 1518 if (len == (size_t) -1)
c829a427 1519 len = strlen (str);
9c44cd45
MG
1520 if (len == 0)
1521 return scm_nullstr;
1522
889975e5
MG
1523 inport = scm_current_input_port ();
1524 if (!SCM_UNBNDP (inport) && SCM_OPINPORTP (inport))
1525 {
1526 pt = SCM_PTAB_ENTRY (inport);
1527 enc = pt->encoding;
1528 hndl = pt->ilseq_handler;
1529 }
1530 else
1531 {
1532 enc = NULL;
1533 hndl = SCM_FAILED_CONVERSION_ERROR;
1534 }
1535
1536 return scm_from_stringn (str, len, enc, hndl);
c829a427 1537}
4d4528e7 1538
c829a427
MV
1539SCM
1540scm_from_locale_string (const char *str)
4d4528e7 1541{
9c44cd45
MG
1542 if (str == NULL)
1543 return scm_nullstr;
1544
c829a427
MV
1545 return scm_from_locale_stringn (str, -1);
1546}
4d4528e7 1547
587a3355
MG
1548SCM
1549scm_i_from_utf8_string (const scm_t_uint8 *str)
1550{
1551 return scm_from_stringn ((const char *) str,
1552 strlen ((char *) str), "UTF-8",
1553 SCM_FAILED_CONVERSION_ERROR);
1554}
1555
50b1996f
MG
1556/* Create a new scheme string from the C string STR. The memory of
1557 STR may be used directly as storage for the new string. */
13a94556
LC
1558/* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1559 would be to register a finalizer to eventually free(3) STR, which isn't
1560 worth it. Should we just deprecate the `scm_take_' functions? */
c829a427
MV
1561SCM
1562scm_take_locale_stringn (char *str, size_t len)
1563{
13a94556 1564 SCM res;
48ddf0d9 1565
13a94556
LC
1566 res = scm_from_locale_stringn (str, len);
1567 free (str);
c829a427 1568
c829a427
MV
1569 return res;
1570}
1571
48ddf0d9
KR
1572SCM
1573scm_take_locale_string (char *str)
1574{
1575 return scm_take_locale_stringn (str, -1);
1576}
1577
f1ee6d54
LC
1578/* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1579 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1580 Set *LENP to the size of the resulting string. */
4ff2b9f4
LC
1581void
1582scm_i_unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
9c44cd45
MG
1583{
1584 char *before, *after;
1585 size_t i, j;
1586
4ff2b9f4
LC
1587 before = buf;
1588 after = buf;
9c44cd45
MG
1589 i = 0;
1590 j = 0;
1591 while (i < *lenp)
1592 {
1593 if ((i <= *lenp - 6)
1594 && before[i] == '\\'
1595 && before[i + 1] == 'u'
1596 && before[i + 2] == '0' && before[i + 3] == '0')
1597 {
1598 /* Convert \u00NN to \xNN */
1599 after[j] = '\\';
1600 after[j + 1] = 'x';
30a6b9ca
MG
1601 after[j + 2] = tolower ((int) before[i + 4]);
1602 after[j + 3] = tolower ((int) before[i + 5]);
9c44cd45
MG
1603 i += 6;
1604 j += 4;
1605 }
1606 else if ((i <= *lenp - 10)
1607 && before[i] == '\\'
1608 && before[i + 1] == 'U'
1609 && before[i + 2] == '0' && before[i + 3] == '0')
1610 {
1611 /* Convert \U00NNNNNN to \UNNNNNN */
1612 after[j] = '\\';
1613 after[j + 1] = 'U';
30a6b9ca
MG
1614 after[j + 2] = tolower ((int) before[i + 4]);
1615 after[j + 3] = tolower ((int) before[i + 5]);
1616 after[j + 4] = tolower ((int) before[i + 6]);
1617 after[j + 5] = tolower ((int) before[i + 7]);
1618 after[j + 6] = tolower ((int) before[i + 8]);
1619 after[j + 7] = tolower ((int) before[i + 9]);
9c44cd45
MG
1620 i += 10;
1621 j += 8;
1622 }
1623 else
1624 {
1625 after[j] = before[i];
1626 i++;
1627 j++;
1628 }
1629 }
1630 *lenp = j;
9c44cd45
MG
1631}
1632
f1ee6d54
LC
1633/* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1634 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1635 of the resulting string. BUF must be large enough to handle the
1636 worst case when `\uXXXX' escapes (6 characters) are replaced by
1637 `\xXXXX;' (7 characters). */
4ff2b9f4
LC
1638void
1639scm_i_unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
d31b9519
MG
1640{
1641 char *before, *after;
1642 size_t i, j;
1643 /* The worst case is if the input string contains all 4-digit hex escapes.
1644 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1645 size_t max_out_len = (*lenp * 7) / 6 + 1;
1646 size_t nzeros, ndigits;
1647
4ff2b9f4 1648 before = buf;
d31b9519
MG
1649 after = alloca (max_out_len);
1650 i = 0;
1651 j = 0;
1652 while (i < *lenp)
1653 {
1654 if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1655 || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1656 {
1657 if (before[i + 1] == 'u')
1658 ndigits = 4;
1659 else if (before[i + 1] == 'U')
1660 ndigits = 8;
1661 else
1662 abort ();
1663
1664 /* Add the R6RS hex escape initial sequence. */
1665 after[j] = '\\';
1666 after[j + 1] = 'x';
1667
1668 /* Move string positions to the start of the hex numbers. */
1669 i += 2;
1670 j += 2;
1671
1672 /* Find the number of initial zeros in this hex number. */
1673 nzeros = 0;
1674 while (before[i + nzeros] == '0' && nzeros < ndigits)
1675 nzeros++;
1676
1677 /* Copy the number, skipping initial zeros, and then move the string
1678 positions. */
1679 if (nzeros == ndigits)
1680 {
1681 after[j] = '0';
1682 i += ndigits;
1683 j += 1;
1684 }
1685 else
1686 {
1687 int pos;
1688 for (pos = 0; pos < ndigits - nzeros; pos++)
1689 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1690 i += ndigits;
1691 j += (ndigits - nzeros);
1692 }
1693
1694 /* Add terminating semicolon. */
1695 after[j] = ';';
1696 j++;
1697 }
1698 else
1699 {
1700 after[j] = before[i];
1701 i++;
1702 j++;
1703 }
1704 }
1705 *lenp = j;
d31b9519
MG
1706 memcpy (before, after, j);
1707}
1708
cf313a94
MG
1709char *
1710scm_to_latin1_stringn (SCM str, size_t *lenp)
1711{
1712 return scm_to_stringn (str, lenp, NULL, SCM_FAILED_CONVERSION_ERROR);
1713}
d31b9519 1714
c829a427 1715char *
fac32b51 1716scm_to_locale_stringn (SCM str, size_t *lenp)
c829a427 1717{
889975e5
MG
1718 SCM outport;
1719 scm_t_port *pt;
9c44cd45
MG
1720 const char *enc;
1721
889975e5
MG
1722 outport = scm_current_output_port ();
1723 if (!SCM_UNBNDP (outport) && SCM_OPOUTPORTP (outport))
1724 {
1725 pt = SCM_PTAB_ENTRY (outport);
1726 enc = pt->encoding;
1727 }
1728 else
1729 enc = NULL;
9c44cd45 1730
889975e5
MG
1731 return scm_to_stringn (str, lenp,
1732 enc,
1733 scm_i_get_conversion_strategy (SCM_BOOL_F));
9c44cd45
MG
1734}
1735
29bcdbb0
LC
1736/* Return a malloc(3)-allocated buffer containing the contents of STR encoded
1737 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
1738 the returned buffer. If the conversion to ENCODING fails, apply the strategy
1739 defined by HANDLER. */
9c44cd45 1740char *
587a3355 1741scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
eca29b02 1742 scm_t_string_failed_conversion_handler handler)
9c44cd45 1743{
9c44cd45
MG
1744 char *buf;
1745 size_t ilen, len, i;
889975e5
MG
1746 int ret;
1747 const char *enc;
4d4528e7 1748
3ee86942 1749 if (!scm_is_string (str))
c829a427 1750 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1751 ilen = scm_i_string_length (str);
1752
1753 if (ilen == 0)
1754 {
1755 buf = scm_malloc (1);
1756 buf[0] = '\0';
1757 if (lenp)
1758 *lenp = 0;
1759 return buf;
1760 }
587a3355 1761
c829a427 1762 if (lenp == NULL)
9c44cd45
MG
1763 for (i = 0; i < ilen; i++)
1764 if (scm_i_string_ref (str, i) == '\0')
1765 scm_misc_error (NULL,
1766 "string contains #\\nul character: ~S",
1767 scm_list_1 (str));
1768
889975e5 1769 if (scm_i_is_narrow_string (str) && (encoding == NULL))
c829a427 1770 {
889975e5
MG
1771 /* If using native Latin-1 encoding, just copy the string
1772 contents. */
9c44cd45
MG
1773 if (lenp)
1774 {
1775 buf = scm_malloc (ilen);
1776 memcpy (buf, scm_i_string_chars (str), ilen);
1777 *lenp = ilen;
1778 return buf;
1779 }
1780 else
1781 {
1782 buf = scm_malloc (ilen + 1);
1783 memcpy (buf, scm_i_string_chars (str), ilen);
1784 buf[ilen] = '\0';
1785 return buf;
1786 }
c829a427 1787 }
9c44cd45 1788
587a3355 1789
9c44cd45
MG
1790 buf = NULL;
1791 len = 0;
889975e5
MG
1792 enc = encoding;
1793 if (enc == NULL)
1794 enc = "ISO-8859-1";
1795 if (scm_i_is_narrow_string (str))
1796 {
1797 ret = mem_iconveh (scm_i_string_chars (str), ilen,
1798 "ISO-8859-1", enc,
1799 (enum iconv_ilseq_handler) handler, NULL,
1800 &buf, &len);
9c44cd45 1801
889975e5 1802 if (ret != 0)
ef7e4ba3
LC
1803 scm_encoding_error (__func__, errno,
1804 "cannot convert to output locale",
1805 "ISO-8859-1", enc, str);
889975e5
MG
1806 }
1807 else
1808 {
d31b9519 1809 buf = u32_conv_to_encoding (enc,
889975e5 1810 (enum iconv_ilseq_handler) handler,
d31b9519 1811 (scm_t_uint32 *) scm_i_string_wide_chars (str),
889975e5
MG
1812 ilen,
1813 NULL,
1814 NULL, &len);
1815 if (buf == NULL)
ef7e4ba3
LC
1816 scm_encoding_error (__func__, errno,
1817 "cannot convert to output locale",
1818 "UTF-32", enc, str);
d31b9519
MG
1819 }
1820 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
1821 {
1822 if (SCM_R6RS_ESCAPES_P)
f1ee6d54
LC
1823 {
1824 /* The worst case is if the input string contains all 4-digit
1825 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
1826 (seven characters). Make BUF large enough to hold
1827 that. */
1828 buf = scm_realloc (buf, (len * 7) / 6 + 1);
1829 scm_i_unistring_escapes_to_r6rs_escapes (buf, &len);
1830 }
d31b9519 1831 else
4ff2b9f4
LC
1832 scm_i_unistring_escapes_to_guile_escapes (buf, &len);
1833
1834 buf = scm_realloc (buf, len);
889975e5 1835 }
9c44cd45 1836 if (lenp)
4d4528e7 1837 *lenp = len;
9c44cd45
MG
1838 else
1839 {
1840 buf = scm_realloc (buf, len + 1);
1841 buf[len] = '\0';
1842 }
24933780 1843
c829a427 1844 scm_remember_upto_here_1 (str);
9c44cd45 1845 return buf;
4d4528e7 1846}
af68e5e5 1847
c829a427
MV
1848char *
1849scm_to_locale_string (SCM str)
1850{
1851 return scm_to_locale_stringn (str, NULL);
1852}
af68e5e5 1853
587a3355
MG
1854scm_t_uint8 *
1855scm_i_to_utf8_string (SCM str)
1856{
1857 char *u8str;
1858 u8str = scm_to_stringn (str, NULL, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1859 return (scm_t_uint8 *) u8str;
1860}
1861
c829a427
MV
1862size_t
1863scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
1864{
1865 size_t len;
9c44cd45 1866 char *result = NULL;
3ee86942 1867 if (!scm_is_string (str))
c829a427 1868 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1869 result = scm_to_locale_stringn (str, &len);
1870
1871 memcpy (buf, result, (len > max_len) ? max_len : len);
1872 free (result);
1873
c829a427
MV
1874 scm_remember_upto_here_1 (str);
1875 return len;
1876}
af68e5e5 1877
a3d7d5d5
LC
1878\f
1879/* Unicode string normalization. */
1880
edb7bb47
JG
1881/* This function is a partial clone of SCM_STRING_TO_U32_BUF from
1882 libguile/i18n.c. It would be useful to have this factored out into a more
1883 convenient location, but its use of alloca makes that tricky to do. */
1884
1885static SCM
1886normalize_str (SCM string, uninorm_t form)
1887{
1888 SCM ret;
1889 scm_t_uint32 *w_str;
1890 scm_t_wchar *cbuf;
1891 size_t rlen, len = scm_i_string_length (string);
1892
1893 if (scm_i_is_narrow_string (string))
1894 {
1895 size_t i;
1896 const char *buf = scm_i_string_chars (string);
1897
1898 w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
1899
1900 for (i = 0; i < len; i ++)
1901 w_str[i] = (unsigned char) buf[i];
1902 w_str[len] = 0;
1903 }
d8164b04
JG
1904 else
1905 w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
1906
edb7bb47
JG
1907 w_str = u32_normalize (form, w_str, len, NULL, &rlen);
1908
1909 ret = scm_i_make_wide_string (rlen, &cbuf);
1910 u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
1911 free (w_str);
d8164b04
JG
1912
1913 scm_i_try_narrow_string (ret);
1914
edb7bb47
JG
1915 return ret;
1916}
1917
1918SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
1919 (SCM string),
1920 "Returns the NFC normalized form of @var{string}.")
1921#define FUNC_NAME s_scm_string_normalize_nfc
1922{
1923 SCM_VALIDATE_STRING (1, string);
1924 return normalize_str (string, UNINORM_NFC);
1925}
1926#undef FUNC_NAME
1927
1928SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
1929 (SCM string),
1930 "Returns the NFD normalized form of @var{string}.")
1931#define FUNC_NAME s_scm_string_normalize_nfd
1932{
1933 SCM_VALIDATE_STRING (1, string);
1934 return normalize_str (string, UNINORM_NFD);
1935}
1936#undef FUNC_NAME
1937
1938SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
1939 (SCM string),
1940 "Returns the NFKC normalized form of @var{string}.")
1941#define FUNC_NAME s_scm_string_normalize_nfkc
1942{
1943 SCM_VALIDATE_STRING (1, string);
1944 return normalize_str (string, UNINORM_NFKC);
1945}
1946#undef FUNC_NAME
1947
1948SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
1949 (SCM string),
1950 "Returns the NFKD normalized form of @var{string}.")
1951#define FUNC_NAME s_scm_string_normalize_nfkd
1952{
1953 SCM_VALIDATE_STRING (1, string);
1954 return normalize_str (string, UNINORM_NFKD);
1955}
1956#undef FUNC_NAME
1957
3ee86942
MV
1958/* converts C scm_array of strings to SCM scm_list of strings. */
1959/* If argc < 0, a null terminated scm_array is assumed. */
9c44cd45 1960SCM
3ee86942
MV
1961scm_makfromstrs (int argc, char **argv)
1962{
1963 int i = argc;
1964 SCM lst = SCM_EOL;
1965 if (0 > i)
1966 for (i = 0; argv[i]; i++);
1967 while (i--)
1968 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
1969 return lst;
1970}
1971
c829a427
MV
1972/* Return a newly allocated array of char pointers to each of the strings
1973 in args, with a terminating NULL pointer. */
1974
1975char **
1976scm_i_allocate_string_pointers (SCM list)
2a776823 1977#define FUNC_NAME "scm_i_allocate_string_pointers"
af68e5e5 1978{
c829a427
MV
1979 char **result;
1980 int len = scm_ilength (list);
1981 int i;
1982
1983 if (len < 0)
1984 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
1985
2a776823
LC
1986 result = scm_gc_malloc ((len + 1) * sizeof (char *),
1987 "string pointers");
c829a427 1988 result[len] = NULL;
c829a427
MV
1989
1990 /* The list might be have been modified in another thread, so
1991 we check LIST before each access.
1992 */
d2e53ed6 1993 for (i = 0; i < len && scm_is_pair (list); i++)
c829a427 1994 {
2a776823
LC
1995 SCM str;
1996 size_t len;
1997
1998 str = SCM_CAR (list);
1999 len = scm_c_string_length (str);
2000
2001 result[i] = scm_gc_malloc_pointerless (len + 1, "string pointers");
2002 memcpy (result[i], scm_i_string_chars (str), len);
2003 result[i][len] = '\0';
2004
c829a427
MV
2005 list = SCM_CDR (list);
2006 }
2007
c829a427 2008 return result;
af68e5e5 2009}
2a776823 2010#undef FUNC_NAME
24933780 2011
6f14f578
MV
2012void
2013scm_i_get_substring_spec (size_t len,
2014 SCM start, size_t *cstart,
2015 SCM end, size_t *cend)
2016{
2017 if (SCM_UNBNDP (start))
2018 *cstart = 0;
2019 else
2020 *cstart = scm_to_unsigned_integer (start, 0, len);
2021
2022 if (SCM_UNBNDP (end))
2023 *cend = len;
2024 else
2025 *cend = scm_to_unsigned_integer (end, *cstart, len);
2026}
2027
3ee86942
MV
2028#if SCM_ENABLE_DEPRECATED
2029
556d75db
MV
2030/* When these definitions are removed, it becomes reasonable to use
2031 read-only strings for string literals. For that, change the reader
2032 to create string literals with scm_c_substring_read_only instead of
2033 with scm_c_substring_copy.
2034*/
2035
3ee86942 2036int
fe78c51a 2037scm_i_deprecated_stringp (SCM str)
3ee86942
MV
2038{
2039 scm_c_issue_deprecation_warning
2040 ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
2041
2616f0e0 2042 return scm_is_string (str);
3ee86942
MV
2043}
2044
2045char *
fe78c51a 2046scm_i_deprecated_string_chars (SCM str)
3ee86942
MV
2047{
2048 char *chars;
2049
2050 scm_c_issue_deprecation_warning
2051 ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
2052
2616f0e0
MV
2053 /* We don't accept shared substrings here since they are not
2054 null-terminated.
2055 */
2056 if (IS_SH_STRING (str))
c291b588
LC
2057 scm_misc_error (NULL,
2058 "SCM_STRING_CHARS does not work with shared substrings",
2616f0e0
MV
2059 SCM_EOL);
2060
877f06c3 2061 /* We explicitly test for read-only strings to produce a better
556d75db
MV
2062 error message.
2063 */
2064
2065 if (IS_RO_STRING (str))
c291b588
LC
2066 scm_misc_error (NULL,
2067 "SCM_STRING_CHARS does not work with read-only strings",
556d75db 2068 SCM_EOL);
c291b588 2069
2616f0e0 2070 /* The following is still wrong, of course...
3ee86942 2071 */
9c44cd45 2072 str = scm_i_string_start_writing (str);
3ee86942
MV
2073 chars = scm_i_string_writable_chars (str);
2074 scm_i_string_stop_writing ();
2075 return chars;
2076}
2077
2078size_t
fe78c51a 2079scm_i_deprecated_string_length (SCM str)
3ee86942
MV
2080{
2081 scm_c_issue_deprecation_warning
2082 ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
2083 return scm_c_string_length (str);
2084}
2085
2086#endif
2087
2a610be5
AW
2088static SCM
2089string_handle_ref (scm_t_array_handle *h, size_t index)
2090{
2091 return scm_c_string_ref (h->array, index);
2092}
2093
2094static void
2095string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2096{
2097 scm_c_string_set_x (h->array, index, val);
2098}
2099
2100static void
2101string_get_handle (SCM v, scm_t_array_handle *h)
2102{
2103 h->array = v;
2104 h->ndims = 1;
2105 h->dims = &h->dim0;
2106 h->dim0.lbnd = 0;
2107 h->dim0.ubnd = scm_c_string_length (v) - 1;
2108 h->dim0.inc = 1;
2109 h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2110 h->elements = h->writable_elements = NULL;
2111}
2112
c5f17102 2113SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2a610be5 2114 string_handle_ref, string_handle_set,
f65e0168
LC
2115 string_get_handle)
2116SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2a610be5 2117
0f2d19dd
JB
2118void
2119scm_init_strings ()
0f2d19dd 2120{
3ee86942 2121 scm_nullstr = scm_i_make_string (0, NULL);
7c33806a 2122
a0599745 2123#include "libguile/strings.x"
0f2d19dd
JB
2124}
2125
89e00824
ML
2126
2127/*
2128 Local Variables:
2129 c-file-style: "gnu"
2130 End:
2131*/