Bump version number for 1.9.9.
[bpt/guile.git] / libguile / strings.c
CommitLineData
29bcdbb0 1/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010 Free Software Foundation, Inc.
0f2d19dd 2 *
73be1d9e 3 * This library is free software; you can redistribute it and/or
53befeb7
NJ
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
0f2d19dd 7 *
53befeb7
NJ
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
0f2d19dd 12 *
73be1d9e
MV
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
53befeb7
NJ
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
73be1d9e 17 */
1bbd0b84 18
1bbd0b84 19
0f2d19dd 20\f
dbb605f5
LC
21#ifdef HAVE_CONFIG_H
22# include <config.h>
23#endif
0f2d19dd 24
faf2c9d7 25#include <string.h>
3ee86942 26#include <stdio.h>
9c44cd45 27#include <ctype.h>
edb7bb47 28#include <uninorm.h>
9c44cd45 29#include <unistr.h>
eca29b02 30#include <uniconv.h>
faf2c9d7 31
889975e5
MG
32#include "striconveh.h"
33
a0599745
MD
34#include "libguile/_scm.h"
35#include "libguile/chars.h"
7c33806a 36#include "libguile/root.h"
a0599745 37#include "libguile/strings.h"
a3d7d5d5 38#include "libguile/error.h"
f45eccff 39#include "libguile/generalized-vectors.h"
1afff620 40#include "libguile/deprecation.h"
a0599745 41#include "libguile/validate.h"
d31b9519 42#include "libguile/private-options.h"
1afff620 43
0f2d19dd
JB
44\f
45
46/* {Strings}
47 */
48
3ee86942
MV
49
50/* Stringbufs
51 *
52 * XXX - keeping an accurate refcount during GC seems to be quite
53 * tricky, so we just keep score of whether a stringbuf might be
50b1996f 54 * shared, not whether it definitely is.
3ee86942
MV
55 *
56 * The scheme I (mvo) tried to keep an accurate reference count would
57 * recount all strings that point to a stringbuf during the mark-phase
58 * of the GC. This was done since one cannot access the stringbuf of
59 * a string when that string is freed (in order to decrease the
60 * reference count). The memory of the stringbuf might have been
61 * reused already for something completely different.
62 *
63 * This recounted worked for a small number of threads beating on
64 * cow-strings, but it failed randomly with more than 10 threads, say.
65 * I couldn't figure out what went wrong, so I used the conservative
66 * approach implemented below.
50b1996f 67 *
ba54a202
LC
68 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
69 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
70 * strings.
3ee86942
MV
71 */
72
ba54a202
LC
73/* The size in words of the stringbuf header (type tag + size). */
74#define STRINGBUF_HEADER_SIZE 2U
75
76#define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
77
35920c00 78#define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
5f236208 79#define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
3ee86942
MV
80
81#define STRINGBUF_TAG scm_tc7_stringbuf
82#define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
9c44cd45 83#define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
3ee86942 84
ba54a202
LC
85#define STRINGBUF_CHARS(buf) ((unsigned char *) \
86 SCM_CELL_OBJECT_LOC (buf, \
87 STRINGBUF_HEADER_SIZE))
88#define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
3ee86942 89
ba54a202 90#define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CHARS (buf))
3ee86942 91
9b41542f
LC
92#define SET_STRINGBUF_SHARED(buf) \
93 do \
94 { \
95 /* Don't modify BUF if it's already marked as shared since it might be \
96 a read-only, statically allocated stringbuf. */ \
97 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
98 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
99 } \
100 while (0)
3ee86942 101
56a3dcd4 102#ifdef SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
103static size_t lenhist[1001];
104#endif
105
50b1996f
MG
106/* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
107 characters. */
3ee86942
MV
108static SCM
109make_stringbuf (size_t len)
0f2d19dd 110{
3ee86942
MV
111 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
112 scm_i_symbol_chars, all stringbufs are null-terminated. Once
113 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
114 has been changed for scm_i_symbol_chars, this null-termination
115 can be dropped.
116 */
117
ba54a202
LC
118 SCM buf;
119
56a3dcd4 120#ifdef SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
121 if (len < 1000)
122 lenhist[len]++;
123 else
124 lenhist[1000]++;
125#endif
0f2d19dd 126
ba54a202
LC
127 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
128 "string"));
129
130 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
131 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
132
133 STRINGBUF_CHARS (buf)[len] = 0;
134
135 return buf;
3ee86942 136}
e53cc817 137
50b1996f
MG
138/* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
139 characters. */
9c44cd45
MG
140static SCM
141make_wide_stringbuf (size_t len)
142{
ba54a202
LC
143 SCM buf;
144 size_t raw_len;
145
56a3dcd4 146#ifdef SCM_STRING_LENGTH_HISTOGRAM
9c44cd45
MG
147 if (len < 1000)
148 lenhist[len]++;
149 else
150 lenhist[1000]++;
151#endif
152
ba54a202
LC
153 raw_len = (len + 1) * sizeof (scm_t_wchar);
154 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
155 "string"));
156
157 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
158 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
159
160 STRINGBUF_WIDE_CHARS (buf)[len] = 0;
161
162 return buf;
9c44cd45
MG
163}
164
ba54a202
LC
165/* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
166 characters from BUF. */
167static SCM
168wide_stringbuf (SCM buf)
9c44cd45 169{
ba54a202 170 SCM new_buf;
9c44cd45
MG
171
172 if (STRINGBUF_WIDE (buf))
ba54a202
LC
173 new_buf = buf;
174 else
9c44cd45 175 {
ba54a202
LC
176 size_t i, len;
177 scm_t_wchar *mem;
9c44cd45 178
ba54a202 179 len = STRINGBUF_LENGTH (buf);
9c44cd45 180
ba54a202 181 new_buf = make_wide_stringbuf (len);
9c44cd45 182
ba54a202 183 mem = STRINGBUF_WIDE_CHARS (new_buf);
9c44cd45 184 for (i = 0; i < len; i++)
ba54a202 185 mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
9c44cd45 186 mem[len] = 0;
9c44cd45 187 }
ba54a202
LC
188
189 return new_buf;
3ee86942 190}
bd9e24b3 191
ba54a202
LC
192/* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
193 characters from BUF, if possible. */
194static SCM
587a3355
MG
195narrow_stringbuf (SCM buf)
196{
ba54a202 197 SCM new_buf;
587a3355
MG
198
199 if (!STRINGBUF_WIDE (buf))
ba54a202
LC
200 new_buf = buf;
201 else
202 {
203 size_t i, len;
204 scm_t_wchar *wmem;
205 unsigned char *mem;
587a3355 206
ba54a202
LC
207 len = STRINGBUF_LENGTH (buf);
208 wmem = STRINGBUF_WIDE_CHARS (buf);
587a3355 209
ba54a202
LC
210 for (i = 0; i < len; i++)
211 if (wmem[i] > 0xFF)
212 /* BUF cannot be narrowed. */
213 return buf;
587a3355 214
ba54a202 215 new_buf = make_stringbuf (len);
587a3355 216
ba54a202
LC
217 mem = STRINGBUF_CHARS (new_buf);
218 for (i = 0; i < len; i++)
219 mem[i] = (unsigned char) wmem[i];
220 mem[len] = 0;
221 }
222
223 return new_buf;
587a3355
MG
224}
225
9de87eea 226scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
bd9e24b3 227
ba54a202 228\f
3ee86942
MV
229/* Copy-on-write strings.
230 */
bd9e24b3 231
3ee86942 232#define STRING_TAG scm_tc7_string
bd9e24b3 233
3ee86942
MV
234#define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
235#define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
236#define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
bd9e24b3 237
3ee86942
MV
238#define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
239#define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
240
241#define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
242
ed35de72
MV
243/* Read-only strings.
244 */
245
35920c00 246#define RO_STRING_TAG scm_tc7_ro_string
ed35de72
MV
247#define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
248
e1b29f6a
MV
249/* Mutation-sharing substrings
250 */
251
252#define SH_STRING_TAG (scm_tc7_string + 0x100)
253
254#define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
255/* START and LENGTH as for STRINGs. */
256
257#define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
258
e7efe8e7
AW
259SCM scm_nullstr;
260
50b1996f
MG
261/* Create a scheme string with space for LEN 8-bit Latin-1-encoded
262 characters. CHARSP, if not NULL, will be set to location of the
263 char array. */
3ee86942
MV
264SCM
265scm_i_make_string (size_t len, char **charsp)
266{
267 SCM buf = make_stringbuf (len);
268 SCM res;
269 if (charsp)
f59cf998 270 *charsp = (char *) STRINGBUF_CHARS (buf);
3ee86942
MV
271 res = scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
272 (scm_t_bits)0, (scm_t_bits) len);
273 return res;
0f2d19dd
JB
274}
275
50b1996f
MG
276/* Create a scheme string with space for LEN 32-bit UCS-4-encoded
277 characters. CHARSP, if not NULL, will be set to location of the
278 character array. */
9c44cd45 279SCM
50b1996f 280scm_i_make_wide_string (size_t len, scm_t_wchar **charsp)
9c44cd45
MG
281{
282 SCM buf = make_wide_stringbuf (len);
283 SCM res;
284 if (charsp)
285 *charsp = STRINGBUF_WIDE_CHARS (buf);
286 res = scm_double_cell (STRING_TAG, SCM_UNPACK (buf),
287 (scm_t_bits) 0, (scm_t_bits) len);
288 return res;
289}
290
3ee86942
MV
291static void
292validate_substring_args (SCM str, size_t start, size_t end)
293{
294 if (!IS_STRING (str))
295 scm_wrong_type_arg_msg (NULL, 0, str, "string");
296 if (start > STRING_LENGTH (str))
297 scm_out_of_range (NULL, scm_from_size_t (start));
298 if (end > STRING_LENGTH (str) || end < start)
299 scm_out_of_range (NULL, scm_from_size_t (end));
300}
0f2d19dd 301
e1b29f6a
MV
302static inline void
303get_str_buf_start (SCM *str, SCM *buf, size_t *start)
304{
305 *start = STRING_START (*str);
306 if (IS_SH_STRING (*str))
307 {
308 *str = SH_STRING_STRING (*str);
309 *start += STRING_START (*str);
310 }
311 *buf = STRING_STRINGBUF (*str);
312}
313
3ee86942
MV
314SCM
315scm_i_substring (SCM str, size_t start, size_t end)
0f2d19dd 316{
e1b29f6a
MV
317 SCM buf;
318 size_t str_start;
319 get_str_buf_start (&str, &buf, &str_start);
9de87eea 320 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 321 SET_STRINGBUF_SHARED (buf);
9de87eea 322 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 323 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
e1b29f6a
MV
324 (scm_t_bits)str_start + start,
325 (scm_t_bits) end - start);
0f2d19dd
JB
326}
327
ed35de72
MV
328SCM
329scm_i_substring_read_only (SCM str, size_t start, size_t end)
330{
45a9f430
LC
331 SCM buf;
332 size_t str_start;
333 get_str_buf_start (&str, &buf, &str_start);
334 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
335 SET_STRINGBUF_SHARED (buf);
336 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
337 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
338 (scm_t_bits)str_start + start,
339 (scm_t_bits) end - start);
ed35de72
MV
340}
341
3ee86942
MV
342SCM
343scm_i_substring_copy (SCM str, size_t start, size_t end)
344{
345 size_t len = end - start;
edea856c 346 SCM buf, my_buf;
e1b29f6a
MV
347 size_t str_start;
348 get_str_buf_start (&str, &buf, &str_start);
9c44cd45
MG
349 if (scm_i_is_narrow_string (str))
350 {
351 my_buf = make_stringbuf (len);
352 memcpy (STRINGBUF_CHARS (my_buf),
353 STRINGBUF_CHARS (buf) + str_start + start, len);
354 }
355 else
356 {
357 my_buf = make_wide_stringbuf (len);
358 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
359 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
360 + start), len);
361 /* Even though this string is wide, the substring may be narrow.
50b1996f 362 Consider adding code to narrow the string. */
9c44cd45 363 }
3ee86942 364 scm_remember_upto_here_1 (buf);
9c44cd45
MG
365 return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
366 (scm_t_bits) 0, (scm_t_bits) len);
3ee86942 367}
0f2d19dd 368
e1b29f6a
MV
369SCM
370scm_i_substring_shared (SCM str, size_t start, size_t end)
371{
372 if (start == 0 && end == STRING_LENGTH (str))
373 return str;
374 else
375 {
376 size_t len = end - start;
377 if (IS_SH_STRING (str))
378 {
379 start += STRING_START (str);
380 str = SH_STRING_STRING (str);
381 }
382 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
383 (scm_t_bits)start, (scm_t_bits) len);
384 }
385}
386
3ee86942
MV
387SCM
388scm_c_substring (SCM str, size_t start, size_t end)
389{
390 validate_substring_args (str, start, end);
391 return scm_i_substring (str, start, end);
392}
ee149d03 393
ed35de72
MV
394SCM
395scm_c_substring_read_only (SCM str, size_t start, size_t end)
396{
397 validate_substring_args (str, start, end);
398 return scm_i_substring_read_only (str, start, end);
399}
400
0f2d19dd 401SCM
3ee86942 402scm_c_substring_copy (SCM str, size_t start, size_t end)
0f2d19dd 403{
3ee86942
MV
404 validate_substring_args (str, start, end);
405 return scm_i_substring_copy (str, start, end);
406}
407
3ee86942
MV
408SCM
409scm_c_substring_shared (SCM str, size_t start, size_t end)
410{
411 validate_substring_args (str, start, end);
412 return scm_i_substring_shared (str, start, end);
413}
0f2d19dd 414
d6c74168 415\f
3ee86942
MV
416/* Internal accessors
417 */
418
50b1996f
MG
419/* Returns the number of characters in STR. This may be different
420 than the memory size of the string storage. */
3ee86942
MV
421size_t
422scm_i_string_length (SCM str)
0f2d19dd 423{
3ee86942 424 return STRING_LENGTH (str);
0f2d19dd
JB
425}
426
50b1996f
MG
427/* True if the string is 'narrow', meaning it has a 8-bit Latin-1
428 encoding. False if it is 'wide', having a 32-bit UCS-4
429 encoding. */
9c44cd45
MG
430int
431scm_i_is_narrow_string (SCM str)
432{
433 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
434}
435
587a3355
MG
436/* Try to coerce a string to be narrow. It if is narrow already, do
437 nothing. If it is wide, shrink it to narrow if none of its
438 characters are above 0xFF. Return true if the string is narrow or
439 was made to be narrow. */
440int
441scm_i_try_narrow_string (SCM str)
442{
ba54a202 443 SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
587a3355
MG
444
445 return scm_i_is_narrow_string (str);
446}
447
50b1996f
MG
448/* Returns a pointer to the 8-bit Latin-1 encoded character array of
449 STR. */
3ee86942
MV
450const char *
451scm_i_string_chars (SCM str)
452{
453 SCM buf;
e1b29f6a
MV
454 size_t start;
455 get_str_buf_start (&str, &buf, &start);
9c44cd45 456 if (scm_i_is_narrow_string (str))
f59cf998 457 return (const char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
458 else
459 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
460 scm_list_1 (str));
461 return NULL;
3ee86942 462}
b00418df 463
50b1996f
MG
464/* Returns a pointer to the 32-bit UCS-4 encoded character array of
465 STR. */
9c44cd45
MG
466const scm_t_wchar *
467scm_i_string_wide_chars (SCM str)
468{
469 SCM buf;
470 size_t start;
471
472 get_str_buf_start (&str, &buf, &start);
473 if (!scm_i_is_narrow_string (str))
f59cf998 474 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
9c44cd45
MG
475 else
476 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
477 scm_list_1 (str));
478}
479
480/* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
481 a new string buffer, so that it can be modified without modifying
50b1996f
MG
482 other strings. Also, lock the string mutex. Later, one must call
483 scm_i_string_stop_writing to unlock the mutex. */
9c44cd45
MG
484SCM
485scm_i_string_start_writing (SCM orig_str)
b00418df 486{
ed35de72 487 SCM buf, str = orig_str;
e1b29f6a 488 size_t start;
ed35de72 489
e1b29f6a 490 get_str_buf_start (&str, &buf, &start);
ed35de72
MV
491 if (IS_RO_STRING (str))
492 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
493
9de87eea 494 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942
MV
495 if (STRINGBUF_SHARED (buf))
496 {
9c44cd45 497 /* Clone the stringbuf. */
3ee86942
MV
498 size_t len = STRING_LENGTH (str);
499 SCM new_buf;
500
9de87eea 501 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 502
9c44cd45
MG
503 if (scm_i_is_narrow_string (str))
504 {
505 new_buf = make_stringbuf (len);
506 memcpy (STRINGBUF_CHARS (new_buf),
507 STRINGBUF_CHARS (buf) + STRING_START (str), len);
508
509 }
510 else
511 {
512 new_buf = make_wide_stringbuf (len);
513 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
514 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
515 + STRING_START (str)), len);
516 }
3ee86942 517
3ee86942
MV
518 SET_STRING_STRINGBUF (str, new_buf);
519 start -= STRING_START (str);
902578f1
LC
520
521 /* FIXME: The following operations are not atomic, so other threads
522 looking at STR may see an inconsistent state. Nevertheless it can't
523 hurt much since (i) accessing STR while it is being mutated can't
524 yield a crash, and (ii) concurrent accesses to STR should be
525 protected by a mutex at the application level. The latter may not
526 apply when STR != ORIG_STR, though. */
3ee86942 527 SET_STRING_START (str, 0);
902578f1 528 SET_STRING_STRINGBUF (str, new_buf);
3ee86942
MV
529
530 buf = new_buf;
531
9de87eea 532 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 533 }
9c44cd45
MG
534 return orig_str;
535}
3ee86942 536
50b1996f 537/* Return a pointer to the 8-bit Latin-1 chars of a string. */
9c44cd45
MG
538char *
539scm_i_string_writable_chars (SCM str)
540{
541 SCM buf;
542 size_t start;
3ee86942 543
9c44cd45
MG
544 get_str_buf_start (&str, &buf, &start);
545 if (scm_i_is_narrow_string (str))
f59cf998 546 return (char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
547 else
548 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
549 scm_list_1 (str));
550 return NULL;
b00418df
DH
551}
552
50b1996f 553/* Return a pointer to the UCS-4 codepoints of a string. */
9c44cd45
MG
554static scm_t_wchar *
555scm_i_string_writable_wide_chars (SCM str)
556{
557 SCM buf;
558 size_t start;
559
560 get_str_buf_start (&str, &buf, &start);
561 if (!scm_i_is_narrow_string (str))
562 return STRINGBUF_WIDE_CHARS (buf) + start;
563 else
1c7b216f 564 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
9c44cd45 565 scm_list_1 (str));
b00418df
DH
566}
567
50b1996f
MG
568/* Unlock the string mutex that was locked when
569 scm_i_string_start_writing was called. */
3ee86942
MV
570void
571scm_i_string_stop_writing (void)
572{
9de87eea 573 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 574}
b00418df 575
50b1996f 576/* Return the Xth character of STR as a UCS-4 codepoint. */
9c44cd45
MG
577scm_t_wchar
578scm_i_string_ref (SCM str, size_t x)
579{
580 if (scm_i_is_narrow_string (str))
581 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
582 else
583 return scm_i_string_wide_chars (str)[x];
584}
585
889975e5
MG
586/* Returns index+1 of the first char in STR that matches C, or
587 0 if the char is not found. */
588int
589scm_i_string_contains_char (SCM str, char ch)
590{
591 size_t i;
592 size_t len = scm_i_string_length (str);
593
594 i = 0;
595 if (scm_i_is_narrow_string (str))
596 {
597 while (i < len)
598 {
599 if (scm_i_string_chars (str)[i] == ch)
600 return i+1;
601 i++;
602 }
603 }
604 else
605 {
606 while (i < len)
607 {
608 if (scm_i_string_wide_chars (str)[i]
609 == (unsigned char) ch)
610 return i+1;
611 i++;
612 }
613 }
614 return 0;
615}
616
3f47e526
MG
617int
618scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
619{
620 if (scm_i_is_narrow_string (sstr))
621 {
622 const char *a = scm_i_string_chars (sstr) + start_x;
623 const char *b = cstr;
624 return strncmp (a, b, strlen(b));
625 }
626 else
627 {
628 size_t i;
629 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
630 const char *b = cstr;
631 for (i = 0; i < strlen (b); i++)
632 {
633 if (a[i] != (unsigned char) b[i])
634 return 1;
635 }
636 }
637 return 0;
638}
639
50b1996f 640/* Set the Pth character of STR to UCS-4 codepoint CHR. */
9c44cd45
MG
641void
642scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
643{
644 if (chr > 0xFF && scm_i_is_narrow_string (str))
ba54a202 645 SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
9c44cd45
MG
646
647 if (scm_i_is_narrow_string (str))
648 {
649 char *dst = scm_i_string_writable_chars (str);
587a3355 650 dst[p] = chr;
9c44cd45
MG
651 }
652 else
653 {
654 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
655 dst[p] = chr;
656 }
657}
658
ba54a202 659\f
3ee86942 660/* Symbols.
587a3355 661
3ee86942
MV
662 Basic symbol creation and accessing is done here, the rest is in
663 symbols.[hc]. This has been done to keep stringbufs and the
664 internals of strings and string-like objects confined to this file.
665*/
666
667#define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
668
669SCM
6869328b
MV
670scm_i_make_symbol (SCM name, scm_t_bits flags,
671 unsigned long hash, SCM props)
3ee86942
MV
672{
673 SCM buf;
674 size_t start = STRING_START (name);
675 size_t length = STRING_LENGTH (name);
676
677 if (IS_SH_STRING (name))
678 {
679 name = SH_STRING_STRING (name);
680 start += STRING_START (name);
681 }
682 buf = SYMBOL_STRINGBUF (name);
683
684 if (start == 0 && length == STRINGBUF_LENGTH (buf))
685 {
686 /* reuse buf. */
9de87eea 687 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 688 SET_STRINGBUF_SHARED (buf);
9de87eea 689 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942
MV
690 }
691 else
692 {
693 /* make new buf. */
9c44cd45
MG
694 if (scm_i_is_narrow_string (name))
695 {
696 SCM new_buf = make_stringbuf (length);
697 memcpy (STRINGBUF_CHARS (new_buf),
698 STRINGBUF_CHARS (buf) + start, length);
699 buf = new_buf;
700 }
701 else
702 {
703 SCM new_buf = make_wide_stringbuf (length);
704 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
705 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
706 length);
707 buf = new_buf;
708 }
3ee86942 709 }
6869328b 710 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
3ee86942
MV
711 (scm_t_bits) hash, SCM_UNPACK (props));
712}
713
fd0a5bbc
HWN
714SCM
715scm_i_c_make_symbol (const char *name, size_t len,
716 scm_t_bits flags, unsigned long hash, SCM props)
717{
718 SCM buf = make_stringbuf (len);
719 memcpy (STRINGBUF_CHARS (buf), name, len);
720
a284cc7e
LC
721 return scm_immutable_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
722 (scm_t_bits) hash, SCM_UNPACK (props));
fd0a5bbc
HWN
723}
724
50b1996f
MG
725/* Returns the number of characters in SYM. This may be different
726 from the memory size of SYM. */
3ee86942
MV
727size_t
728scm_i_symbol_length (SCM sym)
0f2d19dd 729{
3ee86942 730 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
0f2d19dd
JB
731}
732
071bb6a8
LC
733size_t
734scm_c_symbol_length (SCM sym)
735#define FUNC_NAME "scm_c_symbol_length"
736{
737 SCM_VALIDATE_SYMBOL (1, sym);
738
739 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
740}
741#undef FUNC_NAME
742
50b1996f
MG
743/* True if the name of SYM is stored as a Latin-1 encoded string.
744 False if it is stored as a 32-bit UCS-4-encoded string. */
9c44cd45
MG
745int
746scm_i_is_narrow_symbol (SCM sym)
747{
748 SCM buf;
749
750 buf = SYMBOL_STRINGBUF (sym);
751 return !STRINGBUF_WIDE (buf);
752}
753
50b1996f
MG
754/* Returns a pointer to the 8-bit Latin-1 encoded character array that
755 contains the name of SYM. */
3ee86942
MV
756const char *
757scm_i_symbol_chars (SCM sym)
758{
9c44cd45
MG
759 SCM buf;
760
761 buf = SYMBOL_STRINGBUF (sym);
762 if (!STRINGBUF_WIDE (buf))
f59cf998 763 return (const char *) STRINGBUF_CHARS (buf);
9c44cd45
MG
764 else
765 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
766 scm_list_1 (sym));
767}
768
50b1996f
MG
769/* Return a pointer to the 32-bit UCS-4-encoded character array of a
770 symbol's name. */
9c44cd45
MG
771const scm_t_wchar *
772scm_i_symbol_wide_chars (SCM sym)
773{
774 SCM buf;
775
776 buf = SYMBOL_STRINGBUF (sym);
777 if (STRINGBUF_WIDE (buf))
f59cf998 778 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
9c44cd45
MG
779 else
780 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
781 scm_list_1 (sym));
3ee86942 782}
1cc91f1b 783
be54b15d 784SCM
3ee86942 785scm_i_symbol_substring (SCM sym, size_t start, size_t end)
be54b15d 786{
3ee86942 787 SCM buf = SYMBOL_STRINGBUF (sym);
9de87eea 788 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 789 SET_STRINGBUF_SHARED (buf);
9de87eea 790 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
fd2b17b9 791 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
3ee86942
MV
792 (scm_t_bits)start, (scm_t_bits) end - start);
793}
be54b15d 794
50b1996f 795/* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
9c44cd45
MG
796scm_t_wchar
797scm_i_symbol_ref (SCM sym, size_t x)
798{
799 if (scm_i_is_narrow_symbol (sym))
800 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
801 else
802 return scm_i_symbol_wide_chars (sym)[x];
803}
804
3ee86942
MV
805/* Debugging
806 */
be54b15d 807
6ce6923b
MG
808SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
809 "Returns an association list containing debugging information\n"
810 "for @var{str}. The association list has the following entries."
811 "@table @code\n"
812 "@item string\n"
813 "The string itself.\n"
814 "@item start\n"
815 "The start index of the string into its stringbuf\n"
816 "@item length\n"
817 "The length of the string\n"
818 "@item shared\n"
819 "If this string is a substring, it returns its parent string.\n"
820 "Otherwise, it returns @code{#f}\n"
88ed5759
MG
821 "@item read-only\n"
822 "@code{#t} if the string is read-only\n"
6ce6923b
MG
823 "@item stringbuf-chars\n"
824 "A new string containing this string's stringbuf's characters\n"
825 "@item stringbuf-length\n"
826 "The number of characters in this stringbuf\n"
827 "@item stringbuf-shared\n"
828 "@code{#t} if this stringbuf is shared\n"
6ce6923b
MG
829 "@item stringbuf-wide\n"
830 "@code{#t} if this stringbuf's characters are stored in a\n"
831 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
832 "buffer\n"
833 "@end table")
3ee86942
MV
834#define FUNC_NAME s_scm_sys_string_dump
835{
ba54a202 836 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
6ce6923b 837 SCM buf;
3ee86942 838 SCM_VALIDATE_STRING (1, str);
6ce6923b
MG
839
840 /* String info */
841 e1 = scm_cons (scm_from_locale_symbol ("string"),
842 str);
843 e2 = scm_cons (scm_from_locale_symbol ("start"),
844 scm_from_size_t (STRING_START (str)));
845 e3 = scm_cons (scm_from_locale_symbol ("length"),
846 scm_from_size_t (STRING_LENGTH (str)));
847
3ee86942
MV
848 if (IS_SH_STRING (str))
849 {
6ce6923b
MG
850 e4 = scm_cons (scm_from_locale_symbol ("shared"),
851 SH_STRING_STRING (str));
852 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
3ee86942
MV
853 }
854 else
855 {
6ce6923b
MG
856 e4 = scm_cons (scm_from_locale_symbol ("shared"),
857 SCM_BOOL_F);
858 buf = STRING_STRINGBUF (str);
3ee86942 859 }
9c44cd45 860
88ed5759
MG
861 if (IS_RO_STRING (str))
862 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
863 SCM_BOOL_T);
864 else
865 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
866 SCM_BOOL_F);
587a3355 867
6ce6923b 868 /* Stringbuf info */
6ce6923b
MG
869 if (!STRINGBUF_WIDE (buf))
870 {
871 size_t len = STRINGBUF_LENGTH (buf);
872 char *cbuf;
873 SCM sbc = scm_i_make_string (len, &cbuf);
874 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
875 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
876 sbc);
3ee86942 877 }
6ce6923b
MG
878 else
879 {
880 size_t len = STRINGBUF_LENGTH (buf);
881 scm_t_wchar *cbuf;
882 SCM sbc = scm_i_make_wide_string (len, &cbuf);
883 u32_cpy ((scm_t_uint32 *) cbuf,
884 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
885 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
886 sbc);
887 }
888 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
889 scm_from_size_t (STRINGBUF_LENGTH (buf)));
890 if (STRINGBUF_SHARED (buf))
891 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
892 SCM_BOOL_T);
893 else
894 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
895 SCM_BOOL_F);
6ce6923b 896 if (STRINGBUF_WIDE (buf))
ba54a202
LC
897 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
898 SCM_BOOL_T);
6ce6923b 899 else
ba54a202
LC
900 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
901 SCM_BOOL_F);
6ce6923b 902
ba54a202 903 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
3ee86942
MV
904}
905#undef FUNC_NAME
906
6ce6923b
MG
907SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
908 "Returns an association list containing debugging information\n"
909 "for @var{sym}. The association list has the following entries."
910 "@table @code\n"
911 "@item symbol\n"
912 "The symbol itself\n"
913 "@item hash\n"
914 "Its hash value\n"
88ed5759
MG
915 "@item interned\n"
916 "@code{#t} if it is an interned symbol\n"
6ce6923b
MG
917 "@item stringbuf-chars\n"
918 "A new string containing this symbols's stringbuf's characters\n"
919 "@item stringbuf-length\n"
920 "The number of characters in this stringbuf\n"
921 "@item stringbuf-shared\n"
922 "@code{#t} if this stringbuf is shared\n"
6ce6923b
MG
923 "@item stringbuf-wide\n"
924 "@code{#t} if this stringbuf's characters are stored in a\n"
925 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
926 "buffer\n"
927 "@end table")
3ee86942
MV
928#define FUNC_NAME s_scm_sys_symbol_dump
929{
ba54a202 930 SCM e1, e2, e3, e4, e5, e6, e7;
6ce6923b 931 SCM buf;
3ee86942 932 SCM_VALIDATE_SYMBOL (1, sym);
6ce6923b
MG
933 e1 = scm_cons (scm_from_locale_symbol ("symbol"),
934 sym);
935 e2 = scm_cons (scm_from_locale_symbol ("hash"),
936 scm_from_ulong (scm_i_symbol_hash (sym)));
88ed5759
MG
937 e3 = scm_cons (scm_from_locale_symbol ("interned"),
938 scm_symbol_interned_p (sym));
6ce6923b
MG
939 buf = SYMBOL_STRINGBUF (sym);
940
941 /* Stringbuf info */
6ce6923b
MG
942 if (!STRINGBUF_WIDE (buf))
943 {
944 size_t len = STRINGBUF_LENGTH (buf);
945 char *cbuf;
946 SCM sbc = scm_i_make_string (len, &cbuf);
947 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
948 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
949 sbc);
950 }
9c44cd45 951 else
6ce6923b
MG
952 {
953 size_t len = STRINGBUF_LENGTH (buf);
954 scm_t_wchar *cbuf;
955 SCM sbc = scm_i_make_wide_string (len, &cbuf);
956 u32_cpy ((scm_t_uint32 *) cbuf,
957 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
958 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
959 sbc);
960 }
961 e5 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
962 scm_from_size_t (STRINGBUF_LENGTH (buf)));
963 if (STRINGBUF_SHARED (buf))
964 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
965 SCM_BOOL_T);
966 else
967 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
968 SCM_BOOL_F);
6ce6923b 969 if (STRINGBUF_WIDE (buf))
ba54a202 970 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
6ce6923b
MG
971 SCM_BOOL_T);
972 else
ba54a202 973 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
6ce6923b 974 SCM_BOOL_F);
ba54a202 975 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
6ce6923b 976
3ee86942
MV
977}
978#undef FUNC_NAME
979
56a3dcd4 980#ifdef SCM_STRING_LENGTH_HISTOGRAM
6ce6923b 981
9c44cd45 982SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
e1b29f6a 983#define FUNC_NAME s_scm_sys_stringbuf_hist
3ee86942
MV
984{
985 int i;
986 for (i = 0; i < 1000; i++)
987 if (lenhist[i])
988 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
989 fprintf (stderr, ">999: %u\n", lenhist[1000]);
990 return SCM_UNSPECIFIED;
be54b15d
DH
991}
992#undef FUNC_NAME
993
3ee86942
MV
994#endif
995
996\f
997
998SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
999 (SCM obj),
1000 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1001#define FUNC_NAME s_scm_string_p
1002{
1003 return scm_from_bool (IS_STRING (obj));
1004}
1005#undef FUNC_NAME
1006
1007
1008SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1009
1010SCM_DEFINE (scm_string, "string", 0, 0, 1,
1011 (SCM chrs),
1012 "@deffnx {Scheme Procedure} list->string chrs\n"
1013 "Return a newly allocated string composed of the arguments,\n"
1014 "@var{chrs}.")
1015#define FUNC_NAME s_scm_string
1016{
9aa27c1a 1017 SCM result = SCM_BOOL_F;
9c44cd45 1018 SCM rest;
3ee86942 1019 size_t len;
9c44cd45
MG
1020 size_t p = 0;
1021 long i;
9aa27c1a 1022 int wide = 0;
3ee86942 1023
9c44cd45
MG
1024 /* Verify that this is a list of chars. */
1025 i = scm_ilength (chrs);
3c7cf7f5 1026 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
3ee86942 1027
9c44cd45
MG
1028 len = (size_t) i;
1029 rest = chrs;
3ee86942 1030
9c44cd45 1031 while (len > 0 && scm_is_pair (rest))
3ee86942 1032 {
9c44cd45 1033 SCM elt = SCM_CAR (rest);
3ee86942 1034 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
9aa27c1a
MG
1035 if (SCM_CHAR (elt) > 0xFF)
1036 wide = 1;
9c44cd45
MG
1037 rest = SCM_CDR (rest);
1038 len--;
1039 scm_remember_upto_here_1 (elt);
1040 }
1041
1042 /* Construct a string containing this list of chars. */
1043 len = (size_t) i;
1044 rest = chrs;
1045
9aa27c1a 1046 if (wide == 0)
9c44cd45 1047 {
56a3dcd4
LC
1048 char *buf;
1049
9aa27c1a
MG
1050 result = scm_i_make_string (len, NULL);
1051 result = scm_i_string_start_writing (result);
56a3dcd4 1052 buf = scm_i_string_writable_chars (result);
9aa27c1a
MG
1053 while (len > 0 && scm_is_pair (rest))
1054 {
1055 SCM elt = SCM_CAR (rest);
1056 buf[p] = (unsigned char) SCM_CHAR (elt);
1057 p++;
1058 rest = SCM_CDR (rest);
1059 len--;
1060 scm_remember_upto_here_1 (elt);
1061 }
1062 }
1063 else
1064 {
56a3dcd4
LC
1065 scm_t_wchar *buf;
1066
9aa27c1a
MG
1067 result = scm_i_make_wide_string (len, NULL);
1068 result = scm_i_string_start_writing (result);
56a3dcd4 1069 buf = scm_i_string_writable_wide_chars (result);
9aa27c1a
MG
1070 while (len > 0 && scm_is_pair (rest))
1071 {
1072 SCM elt = SCM_CAR (rest);
1073 buf[p] = SCM_CHAR (elt);
1074 p++;
1075 rest = SCM_CDR (rest);
1076 len--;
1077 scm_remember_upto_here_1 (elt);
1078 }
3ee86942 1079 }
9c44cd45
MG
1080 scm_i_string_stop_writing ();
1081
3ee86942
MV
1082 if (len > 0)
1083 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
9c44cd45 1084 if (!scm_is_null (rest))
3ee86942
MV
1085 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1086
1087 return result;
1088}
1089#undef FUNC_NAME
be54b15d 1090
3b3b36dd 1091SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
6fa73e72 1092 (SCM k, SCM chr),
0d26a824
MG
1093 "Return a newly allocated string of\n"
1094 "length @var{k}. If @var{chr} is given, then all elements of\n"
1095 "the string are initialized to @var{chr}, otherwise the contents\n"
9401323e 1096 "of the @var{string} are unspecified.")
1bbd0b84 1097#define FUNC_NAME s_scm_make_string
0f2d19dd 1098{
3ee86942
MV
1099 return scm_c_make_string (scm_to_size_t (k), chr);
1100}
1101#undef FUNC_NAME
1102
1103SCM
1104scm_c_make_string (size_t len, SCM chr)
1105#define FUNC_NAME NULL
1106{
9c44cd45
MG
1107 size_t p;
1108 SCM res = scm_i_make_string (len, NULL);
cb0d8be2 1109
e11e83f3
MV
1110 if (!SCM_UNBNDP (chr))
1111 {
3ee86942 1112 SCM_VALIDATE_CHAR (0, chr);
9c44cd45
MG
1113 res = scm_i_string_start_writing (res);
1114 for (p = 0; p < len; p++)
1115 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1116 scm_i_string_stop_writing ();
0f2d19dd 1117 }
e11e83f3
MV
1118
1119 return res;
0f2d19dd 1120}
1bbd0b84 1121#undef FUNC_NAME
0f2d19dd 1122
3b3b36dd 1123SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
0d26a824
MG
1124 (SCM string),
1125 "Return the number of characters in @var{string}.")
1bbd0b84 1126#define FUNC_NAME s_scm_string_length
0f2d19dd 1127{
d1ca2c64 1128 SCM_VALIDATE_STRING (1, string);
3ee86942 1129 return scm_from_size_t (STRING_LENGTH (string));
0f2d19dd 1130}
1bbd0b84 1131#undef FUNC_NAME
0f2d19dd 1132
f8ba2bb9 1133SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
9c44cd45
MG
1134 (SCM string),
1135 "Return the bytes used to represent a character in @var{string}."
1136 "This will return 1 or 4.")
f8ba2bb9 1137#define FUNC_NAME s_scm_string_bytes_per_char
9c44cd45
MG
1138{
1139 SCM_VALIDATE_STRING (1, string);
1140 if (!scm_i_is_narrow_string (string))
1141 return scm_from_int (4);
1142
1143 return scm_from_int (1);
1144}
1145#undef FUNC_NAME
1146
3ee86942
MV
1147size_t
1148scm_c_string_length (SCM string)
1149{
1150 if (!IS_STRING (string))
1151 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1152 return STRING_LENGTH (string);
1153}
1154
bd9e24b3 1155SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
6fa73e72 1156 (SCM str, SCM k),
9c44cd45
MG
1157 "Return character @var{k} of @var{str} using zero-origin\n"
1158 "indexing. @var{k} must be a valid index of @var{str}.")
1bbd0b84 1159#define FUNC_NAME s_scm_string_ref
0f2d19dd 1160{
3ae3166b 1161 size_t len;
a55c2b68 1162 unsigned long idx;
bd9e24b3 1163
d1ca2c64 1164 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1165
1166 len = scm_i_string_length (str);
1167 if (SCM_LIKELY (len > 0))
1168 idx = scm_to_unsigned_integer (k, 0, len - 1);
1169 else
1170 scm_out_of_range (NULL, k);
1171
9c44cd45
MG
1172 if (scm_i_is_narrow_string (str))
1173 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1174 else
1175 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
0f2d19dd 1176}
1bbd0b84 1177#undef FUNC_NAME
0f2d19dd 1178
3ee86942
MV
1179SCM
1180scm_c_string_ref (SCM str, size_t p)
1181{
1182 if (p >= scm_i_string_length (str))
1183 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1184 if (scm_i_is_narrow_string (str))
1185 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1186 else
1187 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1188
3ee86942 1189}
f0942910 1190
3b3b36dd 1191SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
6fa73e72 1192 (SCM str, SCM k, SCM chr),
9c44cd45
MG
1193 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1194 "an unspecified value. @var{k} must be a valid index of\n"
1195 "@var{str}.")
1bbd0b84 1196#define FUNC_NAME s_scm_string_set_x
0f2d19dd 1197{
3ae3166b 1198 size_t len;
a55c2b68
MV
1199 unsigned long idx;
1200
f0942910 1201 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1202
1203 len = scm_i_string_length (str);
1204 if (SCM_LIKELY (len > 0))
1205 idx = scm_to_unsigned_integer (k, 0, len - 1);
1206 else
1207 scm_out_of_range (NULL, k);
1208
34d19ef6 1209 SCM_VALIDATE_CHAR (3, chr);
9c44cd45
MG
1210 str = scm_i_string_start_writing (str);
1211 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1212 scm_i_string_stop_writing ();
1213
0f2d19dd
JB
1214 return SCM_UNSPECIFIED;
1215}
1bbd0b84 1216#undef FUNC_NAME
0f2d19dd 1217
3ee86942
MV
1218void
1219scm_c_string_set_x (SCM str, size_t p, SCM chr)
1220{
1221 if (p >= scm_i_string_length (str))
1222 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1223 str = scm_i_string_start_writing (str);
1224 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1225 scm_i_string_stop_writing ();
3ee86942 1226}
0f2d19dd 1227
3b3b36dd 1228SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
0d26a824
MG
1229 (SCM str, SCM start, SCM end),
1230 "Return a newly allocated string formed from the characters\n"
1231 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1232 "ending with index @var{end} (exclusive).\n"
1233 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1234 "exact integers satisfying:\n\n"
1235 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1bbd0b84 1236#define FUNC_NAME s_scm_substring
0f2d19dd 1237{
3ee86942 1238 size_t len, from, to;
685c0d71 1239
d1ca2c64 1240 SCM_VALIDATE_STRING (1, str);
3ee86942
MV
1241 len = scm_i_string_length (str);
1242 from = scm_to_unsigned_integer (start, 0, len);
a55c2b68 1243 if (SCM_UNBNDP (end))
3ee86942 1244 to = len;
a55c2b68 1245 else
3ee86942
MV
1246 to = scm_to_unsigned_integer (end, from, len);
1247 return scm_i_substring (str, from, to);
0f2d19dd 1248}
1bbd0b84 1249#undef FUNC_NAME
0f2d19dd 1250
ed35de72
MV
1251SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1252 (SCM str, SCM start, SCM end),
1253 "Return a newly allocated string formed from the characters\n"
1254 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1255 "ending with index @var{end} (exclusive).\n"
1256 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1257 "exact integers satisfying:\n"
1258 "\n"
1259 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1260 "\n"
1261 "The returned string is read-only.\n")
1262#define FUNC_NAME s_scm_substring_read_only
1263{
1264 size_t len, from, to;
1265
1266 SCM_VALIDATE_STRING (1, str);
1267 len = scm_i_string_length (str);
1268 from = scm_to_unsigned_integer (start, 0, len);
1269 if (SCM_UNBNDP (end))
1270 to = len;
1271 else
1272 to = scm_to_unsigned_integer (end, from, len);
1273 return scm_i_substring_read_only (str, from, to);
1274}
1275#undef FUNC_NAME
1276
3ee86942
MV
1277SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1278 (SCM str, SCM start, SCM end),
1279 "Return a newly allocated string formed from the characters\n"
1280 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1281 "ending with index @var{end} (exclusive).\n"
1282 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1283 "exact integers satisfying:\n\n"
1284 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1285#define FUNC_NAME s_scm_substring_copy
1286{
e1b29f6a
MV
1287 /* For the Scheme version, START is mandatory, but for the C
1288 version, it is optional. See scm_string_copy in srfi-13.c for a
1289 rationale.
1290 */
1291
1292 size_t from, to;
3ee86942
MV
1293
1294 SCM_VALIDATE_STRING (1, str);
e1b29f6a
MV
1295 scm_i_get_substring_spec (scm_i_string_length (str),
1296 start, &from, end, &to);
3ee86942
MV
1297 return scm_i_substring_copy (str, from, to);
1298}
1299#undef FUNC_NAME
1300
1301SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1302 (SCM str, SCM start, SCM end),
1303 "Return string that indirectly refers to the characters\n"
1304 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1305 "ending with index @var{end} (exclusive).\n"
1306 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1307 "exact integers satisfying:\n\n"
1308 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1309#define FUNC_NAME s_scm_substring_shared
1310{
1311 size_t len, from, to;
1312
1313 SCM_VALIDATE_STRING (1, str);
1314 len = scm_i_string_length (str);
1315 from = scm_to_unsigned_integer (start, 0, len);
1316 if (SCM_UNBNDP (end))
1317 to = len;
1318 else
1319 to = scm_to_unsigned_integer (end, from, len);
1320 return scm_i_substring_shared (str, from, to);
1321}
1322#undef FUNC_NAME
685c0d71 1323
3b3b36dd 1324SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
6fa73e72 1325 (SCM args),
9c44cd45 1326 "Return a newly allocated string whose characters form the\n"
0d26a824 1327 "concatenation of the given strings, @var{args}.")
1bbd0b84 1328#define FUNC_NAME s_scm_string_append
0f2d19dd
JB
1329{
1330 SCM res;
9c44cd45
MG
1331 size_t len = 0;
1332 int wide = 0;
c829a427 1333 SCM l, s;
bd4911ef 1334 size_t i;
9909c395
MG
1335 union
1336 {
1337 char *narrow;
1338 scm_t_wchar *wide;
1339 } data;
af45e3b0
DH
1340
1341 SCM_VALIDATE_REST_ARGUMENT (args);
9c44cd45 1342 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427
MV
1343 {
1344 s = SCM_CAR (l);
1345 SCM_VALIDATE_STRING (SCM_ARGn, s);
9c44cd45
MG
1346 len += scm_i_string_length (s);
1347 if (!scm_i_is_narrow_string (s))
1348 wide = 1;
c829a427 1349 }
9909c395 1350 data.narrow = NULL;
9c44cd45 1351 if (!wide)
9909c395 1352 res = scm_i_make_string (len, &data.narrow);
9c44cd45 1353 else
9909c395 1354 res = scm_i_make_wide_string (len, &data.wide);
9c44cd45
MG
1355
1356 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427 1357 {
edea856c 1358 size_t len;
c829a427 1359 s = SCM_CAR (l);
3ee86942 1360 SCM_VALIDATE_STRING (SCM_ARGn, s);
edea856c 1361 len = scm_i_string_length (s);
9c44cd45
MG
1362 if (!wide)
1363 {
9909c395
MG
1364 memcpy (data.narrow, scm_i_string_chars (s), len);
1365 data.narrow += len;
9c44cd45
MG
1366 }
1367 else
1368 {
1369 if (scm_i_is_narrow_string (s))
1370 {
1371 for (i = 0; i < scm_i_string_length (s); i++)
9909c395 1372 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
9c44cd45
MG
1373 }
1374 else
9909c395 1375 u32_cpy ((scm_t_uint32 *) data.wide,
9c44cd45 1376 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
9909c395 1377 data.wide += len;
9c44cd45 1378 }
c829a427
MV
1379 scm_remember_upto_here_1 (s);
1380 }
0f2d19dd
JB
1381 return res;
1382}
1bbd0b84 1383#undef FUNC_NAME
0f2d19dd 1384
c829a427
MV
1385int
1386scm_is_string (SCM obj)
1387{
3ee86942 1388 return IS_STRING (obj);
c829a427 1389}
24933780 1390
a3d7d5d5
LC
1391\f
1392/* Conversion to/from other encodings. */
1393
1394SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1395static void
ef7e4ba3
LC
1396scm_encoding_error (const char *subr, int err, const char *message,
1397 const char *from, const char *to, SCM string_or_bv)
1398{
1399 /* Raise an exception that conveys all the information needed to debug the
1400 problem. Only perform locale conversions that are safe; in particular,
1401 don't try to display STRING_OR_BV when it's a string since converting it to
1402 the output locale may fail. */
1403 scm_throw (scm_encoding_error_key,
1404 scm_list_n (scm_from_locale_string (subr),
1405 scm_from_locale_string (message),
1406 scm_from_int (err),
1407 scm_from_locale_string (from),
1408 scm_from_locale_string (to),
1409 string_or_bv,
1410 SCM_UNDEFINED));
a3d7d5d5
LC
1411}
1412
fac32b51 1413SCM
587a3355
MG
1414scm_from_stringn (const char *str, size_t len, const char *encoding,
1415 scm_t_string_failed_conversion_handler handler)
1416{
1417 size_t u32len, i;
1418 scm_t_wchar *u32;
1419 int wide = 0;
1420 SCM res;
1421
fac32b51
MG
1422 if (len == 0)
1423 return scm_nullstr;
1424
889975e5
MG
1425 if (encoding == NULL)
1426 {
1427 /* If encoding is null, use Latin-1. */
1428 char *buf;
1429 res = scm_i_make_string (len, &buf);
1430 memcpy (buf, str, len);
1431 return res;
1432 }
1433
587a3355
MG
1434 u32len = 0;
1435 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1436 (enum iconv_ilseq_handler)
1437 handler,
1438 str, len,
1439 NULL,
1440 NULL, &u32len);
1441
ef7e4ba3 1442 if (SCM_UNLIKELY (u32 == NULL))
587a3355 1443 {
ef7e4ba3
LC
1444 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1445 handler. */
1446 SCM bv;
1447 signed char *buf;
1448
1449 buf = scm_gc_malloc_pointerless (len, "bytevector");
1450 memcpy (buf, str, len);
1451 bv = scm_c_take_bytevector (buf, len);
1452
1453 scm_encoding_error (__func__, errno,
1454 "input locale conversion error",
1455 encoding, "UTF-32", bv);
587a3355
MG
1456 }
1457
1458 i = 0;
1459 while (i < u32len)
1460 if (u32[i++] > 0xFF)
1461 {
1462 wide = 1;
1463 break;
1464 }
1465
1466 if (!wide)
1467 {
1468 char *dst;
1469 res = scm_i_make_string (u32len, &dst);
1470 for (i = 0; i < u32len; i ++)
1471 dst[i] = (unsigned char) u32[i];
1472 dst[u32len] = '\0';
1473 }
1474 else
1475 {
1476 scm_t_wchar *wdst;
1477 res = scm_i_make_wide_string (u32len, &wdst);
1478 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1479 wdst[u32len] = 0;
1480 }
1481
1482 free (u32);
1483 return res;
1484}
1485
c829a427
MV
1486SCM
1487scm_from_locale_stringn (const char *str, size_t len)
1488{
889975e5
MG
1489 const char *enc;
1490 scm_t_string_failed_conversion_handler hndl;
1491 SCM inport;
1492 scm_t_port *pt;
4d4528e7 1493
9c44cd45 1494 if (len == (size_t) -1)
c829a427 1495 len = strlen (str);
9c44cd45
MG
1496 if (len == 0)
1497 return scm_nullstr;
1498
889975e5
MG
1499 inport = scm_current_input_port ();
1500 if (!SCM_UNBNDP (inport) && SCM_OPINPORTP (inport))
1501 {
1502 pt = SCM_PTAB_ENTRY (inport);
1503 enc = pt->encoding;
1504 hndl = pt->ilseq_handler;
1505 }
1506 else
1507 {
1508 enc = NULL;
1509 hndl = SCM_FAILED_CONVERSION_ERROR;
1510 }
1511
1512 return scm_from_stringn (str, len, enc, hndl);
c829a427 1513}
4d4528e7 1514
c829a427
MV
1515SCM
1516scm_from_locale_string (const char *str)
4d4528e7 1517{
9c44cd45
MG
1518 if (str == NULL)
1519 return scm_nullstr;
1520
c829a427
MV
1521 return scm_from_locale_stringn (str, -1);
1522}
4d4528e7 1523
587a3355
MG
1524SCM
1525scm_i_from_utf8_string (const scm_t_uint8 *str)
1526{
1527 return scm_from_stringn ((const char *) str,
1528 strlen ((char *) str), "UTF-8",
1529 SCM_FAILED_CONVERSION_ERROR);
1530}
1531
50b1996f
MG
1532/* Create a new scheme string from the C string STR. The memory of
1533 STR may be used directly as storage for the new string. */
13a94556
LC
1534/* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1535 would be to register a finalizer to eventually free(3) STR, which isn't
1536 worth it. Should we just deprecate the `scm_take_' functions? */
c829a427
MV
1537SCM
1538scm_take_locale_stringn (char *str, size_t len)
1539{
13a94556 1540 SCM res;
48ddf0d9 1541
13a94556
LC
1542 res = scm_from_locale_stringn (str, len);
1543 free (str);
c829a427 1544
c829a427
MV
1545 return res;
1546}
1547
48ddf0d9
KR
1548SCM
1549scm_take_locale_string (char *str)
1550{
1551 return scm_take_locale_stringn (str, -1);
1552}
1553
9c44cd45
MG
1554/* Change libunistring escapes (\uXXXX and \UXXXXXXXX) to \xXX \uXXXX
1555 and \UXXXXXX. */
1556static void
1557unistring_escapes_to_guile_escapes (char **bufp, size_t *lenp)
1558{
1559 char *before, *after;
1560 size_t i, j;
1561
1562 before = *bufp;
1563 after = *bufp;
1564 i = 0;
1565 j = 0;
1566 while (i < *lenp)
1567 {
1568 if ((i <= *lenp - 6)
1569 && before[i] == '\\'
1570 && before[i + 1] == 'u'
1571 && before[i + 2] == '0' && before[i + 3] == '0')
1572 {
1573 /* Convert \u00NN to \xNN */
1574 after[j] = '\\';
1575 after[j + 1] = 'x';
30a6b9ca
MG
1576 after[j + 2] = tolower ((int) before[i + 4]);
1577 after[j + 3] = tolower ((int) before[i + 5]);
9c44cd45
MG
1578 i += 6;
1579 j += 4;
1580 }
1581 else if ((i <= *lenp - 10)
1582 && before[i] == '\\'
1583 && before[i + 1] == 'U'
1584 && before[i + 2] == '0' && before[i + 3] == '0')
1585 {
1586 /* Convert \U00NNNNNN to \UNNNNNN */
1587 after[j] = '\\';
1588 after[j + 1] = 'U';
30a6b9ca
MG
1589 after[j + 2] = tolower ((int) before[i + 4]);
1590 after[j + 3] = tolower ((int) before[i + 5]);
1591 after[j + 4] = tolower ((int) before[i + 6]);
1592 after[j + 5] = tolower ((int) before[i + 7]);
1593 after[j + 6] = tolower ((int) before[i + 8]);
1594 after[j + 7] = tolower ((int) before[i + 9]);
9c44cd45
MG
1595 i += 10;
1596 j += 8;
1597 }
1598 else
1599 {
1600 after[j] = before[i];
1601 i++;
1602 j++;
1603 }
1604 }
1605 *lenp = j;
1606 after = scm_realloc (after, j);
1607}
1608
d31b9519
MG
1609/* Change libunistring escapes (\uXXXX and \UXXXXXXXX) to \xXXXX; */
1610static void
1611unistring_escapes_to_r6rs_escapes (char **bufp, size_t *lenp)
1612{
1613 char *before, *after;
1614 size_t i, j;
1615 /* The worst case is if the input string contains all 4-digit hex escapes.
1616 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1617 size_t max_out_len = (*lenp * 7) / 6 + 1;
1618 size_t nzeros, ndigits;
1619
1620 before = *bufp;
1621 after = alloca (max_out_len);
1622 i = 0;
1623 j = 0;
1624 while (i < *lenp)
1625 {
1626 if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1627 || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1628 {
1629 if (before[i + 1] == 'u')
1630 ndigits = 4;
1631 else if (before[i + 1] == 'U')
1632 ndigits = 8;
1633 else
1634 abort ();
1635
1636 /* Add the R6RS hex escape initial sequence. */
1637 after[j] = '\\';
1638 after[j + 1] = 'x';
1639
1640 /* Move string positions to the start of the hex numbers. */
1641 i += 2;
1642 j += 2;
1643
1644 /* Find the number of initial zeros in this hex number. */
1645 nzeros = 0;
1646 while (before[i + nzeros] == '0' && nzeros < ndigits)
1647 nzeros++;
1648
1649 /* Copy the number, skipping initial zeros, and then move the string
1650 positions. */
1651 if (nzeros == ndigits)
1652 {
1653 after[j] = '0';
1654 i += ndigits;
1655 j += 1;
1656 }
1657 else
1658 {
1659 int pos;
1660 for (pos = 0; pos < ndigits - nzeros; pos++)
1661 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1662 i += ndigits;
1663 j += (ndigits - nzeros);
1664 }
1665
1666 /* Add terminating semicolon. */
1667 after[j] = ';';
1668 j++;
1669 }
1670 else
1671 {
1672 after[j] = before[i];
1673 i++;
1674 j++;
1675 }
1676 }
1677 *lenp = j;
1678 before = scm_realloc (before, j);
1679 memcpy (before, after, j);
1680}
1681
1682
c829a427 1683char *
fac32b51 1684scm_to_locale_stringn (SCM str, size_t *lenp)
c829a427 1685{
889975e5
MG
1686 SCM outport;
1687 scm_t_port *pt;
9c44cd45
MG
1688 const char *enc;
1689
889975e5
MG
1690 outport = scm_current_output_port ();
1691 if (!SCM_UNBNDP (outport) && SCM_OPOUTPORTP (outport))
1692 {
1693 pt = SCM_PTAB_ENTRY (outport);
1694 enc = pt->encoding;
1695 }
1696 else
1697 enc = NULL;
9c44cd45 1698
889975e5
MG
1699 return scm_to_stringn (str, lenp,
1700 enc,
1701 scm_i_get_conversion_strategy (SCM_BOOL_F));
9c44cd45
MG
1702}
1703
29bcdbb0
LC
1704/* Return a malloc(3)-allocated buffer containing the contents of STR encoded
1705 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
1706 the returned buffer. If the conversion to ENCODING fails, apply the strategy
1707 defined by HANDLER. */
9c44cd45 1708char *
587a3355 1709scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
eca29b02 1710 scm_t_string_failed_conversion_handler handler)
9c44cd45 1711{
9c44cd45
MG
1712 char *buf;
1713 size_t ilen, len, i;
889975e5
MG
1714 int ret;
1715 const char *enc;
4d4528e7 1716
3ee86942 1717 if (!scm_is_string (str))
c829a427 1718 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1719 ilen = scm_i_string_length (str);
1720
1721 if (ilen == 0)
1722 {
1723 buf = scm_malloc (1);
1724 buf[0] = '\0';
1725 if (lenp)
1726 *lenp = 0;
1727 return buf;
1728 }
587a3355 1729
c829a427 1730 if (lenp == NULL)
9c44cd45
MG
1731 for (i = 0; i < ilen; i++)
1732 if (scm_i_string_ref (str, i) == '\0')
1733 scm_misc_error (NULL,
1734 "string contains #\\nul character: ~S",
1735 scm_list_1 (str));
1736
889975e5 1737 if (scm_i_is_narrow_string (str) && (encoding == NULL))
c829a427 1738 {
889975e5
MG
1739 /* If using native Latin-1 encoding, just copy the string
1740 contents. */
9c44cd45
MG
1741 if (lenp)
1742 {
1743 buf = scm_malloc (ilen);
1744 memcpy (buf, scm_i_string_chars (str), ilen);
1745 *lenp = ilen;
1746 return buf;
1747 }
1748 else
1749 {
1750 buf = scm_malloc (ilen + 1);
1751 memcpy (buf, scm_i_string_chars (str), ilen);
1752 buf[ilen] = '\0';
1753 return buf;
1754 }
c829a427 1755 }
9c44cd45 1756
587a3355 1757
9c44cd45
MG
1758 buf = NULL;
1759 len = 0;
889975e5
MG
1760 enc = encoding;
1761 if (enc == NULL)
1762 enc = "ISO-8859-1";
1763 if (scm_i_is_narrow_string (str))
1764 {
1765 ret = mem_iconveh (scm_i_string_chars (str), ilen,
1766 "ISO-8859-1", enc,
1767 (enum iconv_ilseq_handler) handler, NULL,
1768 &buf, &len);
9c44cd45 1769
889975e5 1770 if (ret != 0)
ef7e4ba3
LC
1771 scm_encoding_error (__func__, errno,
1772 "cannot convert to output locale",
1773 "ISO-8859-1", enc, str);
889975e5
MG
1774 }
1775 else
1776 {
d31b9519 1777 buf = u32_conv_to_encoding (enc,
889975e5 1778 (enum iconv_ilseq_handler) handler,
d31b9519 1779 (scm_t_uint32 *) scm_i_string_wide_chars (str),
889975e5
MG
1780 ilen,
1781 NULL,
1782 NULL, &len);
1783 if (buf == NULL)
ef7e4ba3
LC
1784 scm_encoding_error (__func__, errno,
1785 "cannot convert to output locale",
1786 "UTF-32", enc, str);
d31b9519
MG
1787 }
1788 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
1789 {
1790 if (SCM_R6RS_ESCAPES_P)
1791 unistring_escapes_to_r6rs_escapes (&buf, &len);
1792 else
5f5920e0 1793 unistring_escapes_to_guile_escapes (&buf, &len);
889975e5 1794 }
9c44cd45 1795 if (lenp)
4d4528e7 1796 *lenp = len;
9c44cd45
MG
1797 else
1798 {
1799 buf = scm_realloc (buf, len + 1);
1800 buf[len] = '\0';
1801 }
24933780 1802
c829a427 1803 scm_remember_upto_here_1 (str);
9c44cd45 1804 return buf;
4d4528e7 1805}
af68e5e5 1806
c829a427
MV
1807char *
1808scm_to_locale_string (SCM str)
1809{
1810 return scm_to_locale_stringn (str, NULL);
1811}
af68e5e5 1812
587a3355
MG
1813scm_t_uint8 *
1814scm_i_to_utf8_string (SCM str)
1815{
1816 char *u8str;
1817 u8str = scm_to_stringn (str, NULL, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1818 return (scm_t_uint8 *) u8str;
1819}
1820
c829a427
MV
1821size_t
1822scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
1823{
1824 size_t len;
9c44cd45 1825 char *result = NULL;
3ee86942 1826 if (!scm_is_string (str))
c829a427 1827 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1828 result = scm_to_locale_stringn (str, &len);
1829
1830 memcpy (buf, result, (len > max_len) ? max_len : len);
1831 free (result);
1832
c829a427
MV
1833 scm_remember_upto_here_1 (str);
1834 return len;
1835}
af68e5e5 1836
a3d7d5d5
LC
1837\f
1838/* Unicode string normalization. */
1839
edb7bb47
JG
1840/* This function is a partial clone of SCM_STRING_TO_U32_BUF from
1841 libguile/i18n.c. It would be useful to have this factored out into a more
1842 convenient location, but its use of alloca makes that tricky to do. */
1843
1844static SCM
1845normalize_str (SCM string, uninorm_t form)
1846{
1847 SCM ret;
1848 scm_t_uint32 *w_str;
1849 scm_t_wchar *cbuf;
1850 size_t rlen, len = scm_i_string_length (string);
1851
1852 if (scm_i_is_narrow_string (string))
1853 {
1854 size_t i;
1855 const char *buf = scm_i_string_chars (string);
1856
1857 w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
1858
1859 for (i = 0; i < len; i ++)
1860 w_str[i] = (unsigned char) buf[i];
1861 w_str[len] = 0;
1862 }
d8164b04
JG
1863 else
1864 w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
1865
edb7bb47
JG
1866 w_str = u32_normalize (form, w_str, len, NULL, &rlen);
1867
1868 ret = scm_i_make_wide_string (rlen, &cbuf);
1869 u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
1870 free (w_str);
d8164b04
JG
1871
1872 scm_i_try_narrow_string (ret);
1873
edb7bb47
JG
1874 return ret;
1875}
1876
1877SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
1878 (SCM string),
1879 "Returns the NFC normalized form of @var{string}.")
1880#define FUNC_NAME s_scm_string_normalize_nfc
1881{
1882 SCM_VALIDATE_STRING (1, string);
1883 return normalize_str (string, UNINORM_NFC);
1884}
1885#undef FUNC_NAME
1886
1887SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
1888 (SCM string),
1889 "Returns the NFD normalized form of @var{string}.")
1890#define FUNC_NAME s_scm_string_normalize_nfd
1891{
1892 SCM_VALIDATE_STRING (1, string);
1893 return normalize_str (string, UNINORM_NFD);
1894}
1895#undef FUNC_NAME
1896
1897SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
1898 (SCM string),
1899 "Returns the NFKC normalized form of @var{string}.")
1900#define FUNC_NAME s_scm_string_normalize_nfkc
1901{
1902 SCM_VALIDATE_STRING (1, string);
1903 return normalize_str (string, UNINORM_NFKC);
1904}
1905#undef FUNC_NAME
1906
1907SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
1908 (SCM string),
1909 "Returns the NFKD normalized form of @var{string}.")
1910#define FUNC_NAME s_scm_string_normalize_nfkd
1911{
1912 SCM_VALIDATE_STRING (1, string);
1913 return normalize_str (string, UNINORM_NFKD);
1914}
1915#undef FUNC_NAME
1916
3ee86942
MV
1917/* converts C scm_array of strings to SCM scm_list of strings. */
1918/* If argc < 0, a null terminated scm_array is assumed. */
9c44cd45 1919SCM
3ee86942
MV
1920scm_makfromstrs (int argc, char **argv)
1921{
1922 int i = argc;
1923 SCM lst = SCM_EOL;
1924 if (0 > i)
1925 for (i = 0; argv[i]; i++);
1926 while (i--)
1927 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
1928 return lst;
1929}
1930
c829a427
MV
1931/* Return a newly allocated array of char pointers to each of the strings
1932 in args, with a terminating NULL pointer. */
1933
1934char **
1935scm_i_allocate_string_pointers (SCM list)
2a776823 1936#define FUNC_NAME "scm_i_allocate_string_pointers"
af68e5e5 1937{
c829a427
MV
1938 char **result;
1939 int len = scm_ilength (list);
1940 int i;
1941
1942 if (len < 0)
1943 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
1944
2a776823
LC
1945 result = scm_gc_malloc ((len + 1) * sizeof (char *),
1946 "string pointers");
c829a427 1947 result[len] = NULL;
c829a427
MV
1948
1949 /* The list might be have been modified in another thread, so
1950 we check LIST before each access.
1951 */
d2e53ed6 1952 for (i = 0; i < len && scm_is_pair (list); i++)
c829a427 1953 {
2a776823
LC
1954 SCM str;
1955 size_t len;
1956
1957 str = SCM_CAR (list);
1958 len = scm_c_string_length (str);
1959
1960 result[i] = scm_gc_malloc_pointerless (len + 1, "string pointers");
1961 memcpy (result[i], scm_i_string_chars (str), len);
1962 result[i][len] = '\0';
1963
c829a427
MV
1964 list = SCM_CDR (list);
1965 }
1966
c829a427 1967 return result;
af68e5e5 1968}
2a776823 1969#undef FUNC_NAME
24933780 1970
6f14f578
MV
1971void
1972scm_i_get_substring_spec (size_t len,
1973 SCM start, size_t *cstart,
1974 SCM end, size_t *cend)
1975{
1976 if (SCM_UNBNDP (start))
1977 *cstart = 0;
1978 else
1979 *cstart = scm_to_unsigned_integer (start, 0, len);
1980
1981 if (SCM_UNBNDP (end))
1982 *cend = len;
1983 else
1984 *cend = scm_to_unsigned_integer (end, *cstart, len);
1985}
1986
3ee86942
MV
1987#if SCM_ENABLE_DEPRECATED
1988
556d75db
MV
1989/* When these definitions are removed, it becomes reasonable to use
1990 read-only strings for string literals. For that, change the reader
1991 to create string literals with scm_c_substring_read_only instead of
1992 with scm_c_substring_copy.
1993*/
1994
3ee86942 1995int
fe78c51a 1996scm_i_deprecated_stringp (SCM str)
3ee86942
MV
1997{
1998 scm_c_issue_deprecation_warning
1999 ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
2000
2616f0e0 2001 return scm_is_string (str);
3ee86942
MV
2002}
2003
2004char *
fe78c51a 2005scm_i_deprecated_string_chars (SCM str)
3ee86942
MV
2006{
2007 char *chars;
2008
2009 scm_c_issue_deprecation_warning
2010 ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
2011
2616f0e0
MV
2012 /* We don't accept shared substrings here since they are not
2013 null-terminated.
2014 */
2015 if (IS_SH_STRING (str))
c291b588
LC
2016 scm_misc_error (NULL,
2017 "SCM_STRING_CHARS does not work with shared substrings",
2616f0e0
MV
2018 SCM_EOL);
2019
877f06c3 2020 /* We explicitly test for read-only strings to produce a better
556d75db
MV
2021 error message.
2022 */
2023
2024 if (IS_RO_STRING (str))
c291b588
LC
2025 scm_misc_error (NULL,
2026 "SCM_STRING_CHARS does not work with read-only strings",
556d75db 2027 SCM_EOL);
c291b588 2028
2616f0e0 2029 /* The following is still wrong, of course...
3ee86942 2030 */
9c44cd45 2031 str = scm_i_string_start_writing (str);
3ee86942
MV
2032 chars = scm_i_string_writable_chars (str);
2033 scm_i_string_stop_writing ();
2034 return chars;
2035}
2036
2037size_t
fe78c51a 2038scm_i_deprecated_string_length (SCM str)
3ee86942
MV
2039{
2040 scm_c_issue_deprecation_warning
2041 ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
2042 return scm_c_string_length (str);
2043}
2044
2045#endif
2046
2a610be5
AW
2047static SCM
2048string_handle_ref (scm_t_array_handle *h, size_t index)
2049{
2050 return scm_c_string_ref (h->array, index);
2051}
2052
2053static void
2054string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2055{
2056 scm_c_string_set_x (h->array, index, val);
2057}
2058
2059static void
2060string_get_handle (SCM v, scm_t_array_handle *h)
2061{
2062 h->array = v;
2063 h->ndims = 1;
2064 h->dims = &h->dim0;
2065 h->dim0.lbnd = 0;
2066 h->dim0.ubnd = scm_c_string_length (v) - 1;
2067 h->dim0.inc = 1;
2068 h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2069 h->elements = h->writable_elements = NULL;
2070}
2071
c5f17102 2072SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2a610be5 2073 string_handle_ref, string_handle_set,
f65e0168
LC
2074 string_get_handle)
2075SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2a610be5 2076
0f2d19dd
JB
2077void
2078scm_init_strings ()
0f2d19dd 2079{
3ee86942 2080 scm_nullstr = scm_i_make_string (0, NULL);
7c33806a 2081
a0599745 2082#include "libguile/strings.x"
0f2d19dd
JB
2083}
2084
89e00824
ML
2085
2086/*
2087 Local Variables:
2088 c-file-style: "gnu"
2089 End:
2090*/