Revert "Install `standard-library.info'."
[bpt/guile.git] / libguile / strings.c
CommitLineData
29bcdbb0 1/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010 Free Software Foundation, Inc.
0f2d19dd 2 *
73be1d9e 3 * This library is free software; you can redistribute it and/or
53befeb7
NJ
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
0f2d19dd 7 *
53befeb7
NJ
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
0f2d19dd 12 *
73be1d9e
MV
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
53befeb7
NJ
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
73be1d9e 17 */
1bbd0b84 18
1bbd0b84 19
0f2d19dd 20\f
dbb605f5
LC
21#ifdef HAVE_CONFIG_H
22# include <config.h>
23#endif
0f2d19dd 24
faf2c9d7 25#include <string.h>
3ee86942 26#include <stdio.h>
9c44cd45 27#include <ctype.h>
edb7bb47 28#include <uninorm.h>
9c44cd45 29#include <unistr.h>
eca29b02 30#include <uniconv.h>
faf2c9d7 31
889975e5
MG
32#include "striconveh.h"
33
a0599745
MD
34#include "libguile/_scm.h"
35#include "libguile/chars.h"
7c33806a 36#include "libguile/root.h"
a0599745 37#include "libguile/strings.h"
a3d7d5d5 38#include "libguile/error.h"
f45eccff 39#include "libguile/generalized-vectors.h"
1afff620 40#include "libguile/deprecation.h"
a0599745 41#include "libguile/validate.h"
1afff620 42
0f2d19dd
JB
43\f
44
45/* {Strings}
46 */
47
3ee86942
MV
48
49/* Stringbufs
50 *
51 * XXX - keeping an accurate refcount during GC seems to be quite
52 * tricky, so we just keep score of whether a stringbuf might be
50b1996f 53 * shared, not whether it definitely is.
3ee86942
MV
54 *
55 * The scheme I (mvo) tried to keep an accurate reference count would
56 * recount all strings that point to a stringbuf during the mark-phase
57 * of the GC. This was done since one cannot access the stringbuf of
58 * a string when that string is freed (in order to decrease the
59 * reference count). The memory of the stringbuf might have been
60 * reused already for something completely different.
61 *
62 * This recounted worked for a small number of threads beating on
63 * cow-strings, but it failed randomly with more than 10 threads, say.
64 * I couldn't figure out what went wrong, so I used the conservative
65 * approach implemented below.
50b1996f 66 *
ba54a202
LC
67 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
68 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
69 * strings.
3ee86942
MV
70 */
71
ba54a202
LC
72/* The size in words of the stringbuf header (type tag + size). */
73#define STRINGBUF_HEADER_SIZE 2U
74
75#define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
76
35920c00 77#define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
5f236208 78#define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
3ee86942
MV
79
80#define STRINGBUF_TAG scm_tc7_stringbuf
81#define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
9c44cd45 82#define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
3ee86942 83
ba54a202
LC
84#define STRINGBUF_CHARS(buf) ((unsigned char *) \
85 SCM_CELL_OBJECT_LOC (buf, \
86 STRINGBUF_HEADER_SIZE))
87#define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
3ee86942 88
ba54a202 89#define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CHARS (buf))
3ee86942 90
9b41542f
LC
91#define SET_STRINGBUF_SHARED(buf) \
92 do \
93 { \
94 /* Don't modify BUF if it's already marked as shared since it might be \
95 a read-only, statically allocated stringbuf. */ \
96 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
97 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
98 } \
99 while (0)
3ee86942 100
56a3dcd4 101#ifdef SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
102static size_t lenhist[1001];
103#endif
104
50b1996f
MG
105/* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
106 characters. */
3ee86942
MV
107static SCM
108make_stringbuf (size_t len)
0f2d19dd 109{
3ee86942
MV
110 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
111 scm_i_symbol_chars, all stringbufs are null-terminated. Once
112 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
113 has been changed for scm_i_symbol_chars, this null-termination
114 can be dropped.
115 */
116
ba54a202
LC
117 SCM buf;
118
56a3dcd4 119#ifdef SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
120 if (len < 1000)
121 lenhist[len]++;
122 else
123 lenhist[1000]++;
124#endif
0f2d19dd 125
ba54a202
LC
126 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
127 "string"));
128
129 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
130 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
131
132 STRINGBUF_CHARS (buf)[len] = 0;
133
134 return buf;
3ee86942 135}
e53cc817 136
50b1996f
MG
137/* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
138 characters. */
9c44cd45
MG
139static SCM
140make_wide_stringbuf (size_t len)
141{
ba54a202
LC
142 SCM buf;
143 size_t raw_len;
144
56a3dcd4 145#ifdef SCM_STRING_LENGTH_HISTOGRAM
9c44cd45
MG
146 if (len < 1000)
147 lenhist[len]++;
148 else
149 lenhist[1000]++;
150#endif
151
ba54a202
LC
152 raw_len = (len + 1) * sizeof (scm_t_wchar);
153 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
154 "string"));
155
156 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
157 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
158
159 STRINGBUF_WIDE_CHARS (buf)[len] = 0;
160
161 return buf;
9c44cd45
MG
162}
163
ba54a202
LC
164/* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
165 characters from BUF. */
166static SCM
167wide_stringbuf (SCM buf)
9c44cd45 168{
ba54a202 169 SCM new_buf;
9c44cd45
MG
170
171 if (STRINGBUF_WIDE (buf))
ba54a202
LC
172 new_buf = buf;
173 else
9c44cd45 174 {
ba54a202
LC
175 size_t i, len;
176 scm_t_wchar *mem;
9c44cd45 177
ba54a202 178 len = STRINGBUF_LENGTH (buf);
9c44cd45 179
ba54a202 180 new_buf = make_wide_stringbuf (len);
9c44cd45 181
ba54a202 182 mem = STRINGBUF_WIDE_CHARS (new_buf);
9c44cd45 183 for (i = 0; i < len; i++)
ba54a202 184 mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
9c44cd45 185 mem[len] = 0;
9c44cd45 186 }
ba54a202
LC
187
188 return new_buf;
3ee86942 189}
bd9e24b3 190
ba54a202
LC
191/* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
192 characters from BUF, if possible. */
193static SCM
587a3355
MG
194narrow_stringbuf (SCM buf)
195{
ba54a202 196 SCM new_buf;
587a3355
MG
197
198 if (!STRINGBUF_WIDE (buf))
ba54a202
LC
199 new_buf = buf;
200 else
201 {
202 size_t i, len;
203 scm_t_wchar *wmem;
204 unsigned char *mem;
587a3355 205
ba54a202
LC
206 len = STRINGBUF_LENGTH (buf);
207 wmem = STRINGBUF_WIDE_CHARS (buf);
587a3355 208
ba54a202
LC
209 for (i = 0; i < len; i++)
210 if (wmem[i] > 0xFF)
211 /* BUF cannot be narrowed. */
212 return buf;
587a3355 213
ba54a202 214 new_buf = make_stringbuf (len);
587a3355 215
ba54a202
LC
216 mem = STRINGBUF_CHARS (new_buf);
217 for (i = 0; i < len; i++)
218 mem[i] = (unsigned char) wmem[i];
219 mem[len] = 0;
220 }
221
222 return new_buf;
587a3355
MG
223}
224
9de87eea 225scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
bd9e24b3 226
ba54a202 227\f
3ee86942
MV
228/* Copy-on-write strings.
229 */
bd9e24b3 230
3ee86942 231#define STRING_TAG scm_tc7_string
bd9e24b3 232
3ee86942
MV
233#define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
234#define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
235#define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
bd9e24b3 236
3ee86942
MV
237#define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
238#define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
239
240#define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
241
ed35de72
MV
242/* Read-only strings.
243 */
244
35920c00 245#define RO_STRING_TAG scm_tc7_ro_string
ed35de72
MV
246#define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
247
e1b29f6a
MV
248/* Mutation-sharing substrings
249 */
250
251#define SH_STRING_TAG (scm_tc7_string + 0x100)
252
253#define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
254/* START and LENGTH as for STRINGs. */
255
256#define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
257
e7efe8e7
AW
258SCM scm_nullstr;
259
50b1996f
MG
260/* Create a scheme string with space for LEN 8-bit Latin-1-encoded
261 characters. CHARSP, if not NULL, will be set to location of the
262 char array. */
3ee86942
MV
263SCM
264scm_i_make_string (size_t len, char **charsp)
265{
266 SCM buf = make_stringbuf (len);
267 SCM res;
268 if (charsp)
f59cf998 269 *charsp = (char *) STRINGBUF_CHARS (buf);
3ee86942
MV
270 res = scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
271 (scm_t_bits)0, (scm_t_bits) len);
272 return res;
0f2d19dd
JB
273}
274
50b1996f
MG
275/* Create a scheme string with space for LEN 32-bit UCS-4-encoded
276 characters. CHARSP, if not NULL, will be set to location of the
277 character array. */
9c44cd45 278SCM
50b1996f 279scm_i_make_wide_string (size_t len, scm_t_wchar **charsp)
9c44cd45
MG
280{
281 SCM buf = make_wide_stringbuf (len);
282 SCM res;
283 if (charsp)
284 *charsp = STRINGBUF_WIDE_CHARS (buf);
285 res = scm_double_cell (STRING_TAG, SCM_UNPACK (buf),
286 (scm_t_bits) 0, (scm_t_bits) len);
287 return res;
288}
289
3ee86942
MV
290static void
291validate_substring_args (SCM str, size_t start, size_t end)
292{
293 if (!IS_STRING (str))
294 scm_wrong_type_arg_msg (NULL, 0, str, "string");
295 if (start > STRING_LENGTH (str))
296 scm_out_of_range (NULL, scm_from_size_t (start));
297 if (end > STRING_LENGTH (str) || end < start)
298 scm_out_of_range (NULL, scm_from_size_t (end));
299}
0f2d19dd 300
e1b29f6a
MV
301static inline void
302get_str_buf_start (SCM *str, SCM *buf, size_t *start)
303{
304 *start = STRING_START (*str);
305 if (IS_SH_STRING (*str))
306 {
307 *str = SH_STRING_STRING (*str);
308 *start += STRING_START (*str);
309 }
310 *buf = STRING_STRINGBUF (*str);
311}
312
3ee86942
MV
313SCM
314scm_i_substring (SCM str, size_t start, size_t end)
0f2d19dd 315{
e1b29f6a
MV
316 SCM buf;
317 size_t str_start;
318 get_str_buf_start (&str, &buf, &str_start);
9de87eea 319 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 320 SET_STRINGBUF_SHARED (buf);
9de87eea 321 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 322 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
e1b29f6a
MV
323 (scm_t_bits)str_start + start,
324 (scm_t_bits) end - start);
0f2d19dd
JB
325}
326
ed35de72
MV
327SCM
328scm_i_substring_read_only (SCM str, size_t start, size_t end)
329{
45a9f430
LC
330 SCM buf;
331 size_t str_start;
332 get_str_buf_start (&str, &buf, &str_start);
333 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
334 SET_STRINGBUF_SHARED (buf);
335 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
336 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
337 (scm_t_bits)str_start + start,
338 (scm_t_bits) end - start);
ed35de72
MV
339}
340
3ee86942
MV
341SCM
342scm_i_substring_copy (SCM str, size_t start, size_t end)
343{
344 size_t len = end - start;
edea856c 345 SCM buf, my_buf;
e1b29f6a
MV
346 size_t str_start;
347 get_str_buf_start (&str, &buf, &str_start);
9c44cd45
MG
348 if (scm_i_is_narrow_string (str))
349 {
350 my_buf = make_stringbuf (len);
351 memcpy (STRINGBUF_CHARS (my_buf),
352 STRINGBUF_CHARS (buf) + str_start + start, len);
353 }
354 else
355 {
356 my_buf = make_wide_stringbuf (len);
357 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
358 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
359 + start), len);
360 /* Even though this string is wide, the substring may be narrow.
50b1996f 361 Consider adding code to narrow the string. */
9c44cd45 362 }
3ee86942 363 scm_remember_upto_here_1 (buf);
9c44cd45
MG
364 return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
365 (scm_t_bits) 0, (scm_t_bits) len);
3ee86942 366}
0f2d19dd 367
e1b29f6a
MV
368SCM
369scm_i_substring_shared (SCM str, size_t start, size_t end)
370{
371 if (start == 0 && end == STRING_LENGTH (str))
372 return str;
373 else
374 {
375 size_t len = end - start;
376 if (IS_SH_STRING (str))
377 {
378 start += STRING_START (str);
379 str = SH_STRING_STRING (str);
380 }
381 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
382 (scm_t_bits)start, (scm_t_bits) len);
383 }
384}
385
3ee86942
MV
386SCM
387scm_c_substring (SCM str, size_t start, size_t end)
388{
389 validate_substring_args (str, start, end);
390 return scm_i_substring (str, start, end);
391}
ee149d03 392
ed35de72
MV
393SCM
394scm_c_substring_read_only (SCM str, size_t start, size_t end)
395{
396 validate_substring_args (str, start, end);
397 return scm_i_substring_read_only (str, start, end);
398}
399
0f2d19dd 400SCM
3ee86942 401scm_c_substring_copy (SCM str, size_t start, size_t end)
0f2d19dd 402{
3ee86942
MV
403 validate_substring_args (str, start, end);
404 return scm_i_substring_copy (str, start, end);
405}
406
3ee86942
MV
407SCM
408scm_c_substring_shared (SCM str, size_t start, size_t end)
409{
410 validate_substring_args (str, start, end);
411 return scm_i_substring_shared (str, start, end);
412}
0f2d19dd 413
d6c74168 414\f
3ee86942
MV
415/* Internal accessors
416 */
417
50b1996f
MG
418/* Returns the number of characters in STR. This may be different
419 than the memory size of the string storage. */
3ee86942
MV
420size_t
421scm_i_string_length (SCM str)
0f2d19dd 422{
3ee86942 423 return STRING_LENGTH (str);
0f2d19dd
JB
424}
425
50b1996f
MG
426/* True if the string is 'narrow', meaning it has a 8-bit Latin-1
427 encoding. False if it is 'wide', having a 32-bit UCS-4
428 encoding. */
9c44cd45
MG
429int
430scm_i_is_narrow_string (SCM str)
431{
432 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
433}
434
587a3355
MG
435/* Try to coerce a string to be narrow. It if is narrow already, do
436 nothing. If it is wide, shrink it to narrow if none of its
437 characters are above 0xFF. Return true if the string is narrow or
438 was made to be narrow. */
439int
440scm_i_try_narrow_string (SCM str)
441{
ba54a202 442 SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
587a3355
MG
443
444 return scm_i_is_narrow_string (str);
445}
446
50b1996f
MG
447/* Returns a pointer to the 8-bit Latin-1 encoded character array of
448 STR. */
3ee86942
MV
449const char *
450scm_i_string_chars (SCM str)
451{
452 SCM buf;
e1b29f6a
MV
453 size_t start;
454 get_str_buf_start (&str, &buf, &start);
9c44cd45 455 if (scm_i_is_narrow_string (str))
f59cf998 456 return (const char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
457 else
458 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
459 scm_list_1 (str));
460 return NULL;
3ee86942 461}
b00418df 462
50b1996f
MG
463/* Returns a pointer to the 32-bit UCS-4 encoded character array of
464 STR. */
9c44cd45
MG
465const scm_t_wchar *
466scm_i_string_wide_chars (SCM str)
467{
468 SCM buf;
469 size_t start;
470
471 get_str_buf_start (&str, &buf, &start);
472 if (!scm_i_is_narrow_string (str))
f59cf998 473 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
9c44cd45
MG
474 else
475 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
476 scm_list_1 (str));
477}
478
479/* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
480 a new string buffer, so that it can be modified without modifying
50b1996f
MG
481 other strings. Also, lock the string mutex. Later, one must call
482 scm_i_string_stop_writing to unlock the mutex. */
9c44cd45
MG
483SCM
484scm_i_string_start_writing (SCM orig_str)
b00418df 485{
ed35de72 486 SCM buf, str = orig_str;
e1b29f6a 487 size_t start;
ed35de72 488
e1b29f6a 489 get_str_buf_start (&str, &buf, &start);
ed35de72
MV
490 if (IS_RO_STRING (str))
491 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
492
9de87eea 493 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942
MV
494 if (STRINGBUF_SHARED (buf))
495 {
9c44cd45 496 /* Clone the stringbuf. */
3ee86942
MV
497 size_t len = STRING_LENGTH (str);
498 SCM new_buf;
499
9de87eea 500 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 501
9c44cd45
MG
502 if (scm_i_is_narrow_string (str))
503 {
504 new_buf = make_stringbuf (len);
505 memcpy (STRINGBUF_CHARS (new_buf),
506 STRINGBUF_CHARS (buf) + STRING_START (str), len);
507
508 }
509 else
510 {
511 new_buf = make_wide_stringbuf (len);
512 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
513 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
514 + STRING_START (str)), len);
515 }
3ee86942 516
3ee86942
MV
517 SET_STRING_STRINGBUF (str, new_buf);
518 start -= STRING_START (str);
902578f1
LC
519
520 /* FIXME: The following operations are not atomic, so other threads
521 looking at STR may see an inconsistent state. Nevertheless it can't
522 hurt much since (i) accessing STR while it is being mutated can't
523 yield a crash, and (ii) concurrent accesses to STR should be
524 protected by a mutex at the application level. The latter may not
525 apply when STR != ORIG_STR, though. */
3ee86942 526 SET_STRING_START (str, 0);
902578f1 527 SET_STRING_STRINGBUF (str, new_buf);
3ee86942
MV
528
529 buf = new_buf;
530
9de87eea 531 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 532 }
9c44cd45
MG
533 return orig_str;
534}
3ee86942 535
50b1996f 536/* Return a pointer to the 8-bit Latin-1 chars of a string. */
9c44cd45
MG
537char *
538scm_i_string_writable_chars (SCM str)
539{
540 SCM buf;
541 size_t start;
3ee86942 542
9c44cd45
MG
543 get_str_buf_start (&str, &buf, &start);
544 if (scm_i_is_narrow_string (str))
f59cf998 545 return (char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
546 else
547 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
548 scm_list_1 (str));
549 return NULL;
b00418df
DH
550}
551
50b1996f 552/* Return a pointer to the UCS-4 codepoints of a string. */
9c44cd45
MG
553static scm_t_wchar *
554scm_i_string_writable_wide_chars (SCM str)
555{
556 SCM buf;
557 size_t start;
558
559 get_str_buf_start (&str, &buf, &start);
560 if (!scm_i_is_narrow_string (str))
561 return STRINGBUF_WIDE_CHARS (buf) + start;
562 else
1c7b216f 563 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
9c44cd45 564 scm_list_1 (str));
b00418df
DH
565}
566
50b1996f
MG
567/* Unlock the string mutex that was locked when
568 scm_i_string_start_writing was called. */
3ee86942
MV
569void
570scm_i_string_stop_writing (void)
571{
9de87eea 572 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 573}
b00418df 574
50b1996f 575/* Return the Xth character of STR as a UCS-4 codepoint. */
9c44cd45
MG
576scm_t_wchar
577scm_i_string_ref (SCM str, size_t x)
578{
579 if (scm_i_is_narrow_string (str))
580 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
581 else
582 return scm_i_string_wide_chars (str)[x];
583}
584
889975e5
MG
585/* Returns index+1 of the first char in STR that matches C, or
586 0 if the char is not found. */
587int
588scm_i_string_contains_char (SCM str, char ch)
589{
590 size_t i;
591 size_t len = scm_i_string_length (str);
592
593 i = 0;
594 if (scm_i_is_narrow_string (str))
595 {
596 while (i < len)
597 {
598 if (scm_i_string_chars (str)[i] == ch)
599 return i+1;
600 i++;
601 }
602 }
603 else
604 {
605 while (i < len)
606 {
607 if (scm_i_string_wide_chars (str)[i]
608 == (unsigned char) ch)
609 return i+1;
610 i++;
611 }
612 }
613 return 0;
614}
615
3f47e526
MG
616int
617scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
618{
619 if (scm_i_is_narrow_string (sstr))
620 {
621 const char *a = scm_i_string_chars (sstr) + start_x;
622 const char *b = cstr;
623 return strncmp (a, b, strlen(b));
624 }
625 else
626 {
627 size_t i;
628 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
629 const char *b = cstr;
630 for (i = 0; i < strlen (b); i++)
631 {
632 if (a[i] != (unsigned char) b[i])
633 return 1;
634 }
635 }
636 return 0;
637}
638
50b1996f 639/* Set the Pth character of STR to UCS-4 codepoint CHR. */
9c44cd45
MG
640void
641scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
642{
643 if (chr > 0xFF && scm_i_is_narrow_string (str))
ba54a202 644 SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
9c44cd45
MG
645
646 if (scm_i_is_narrow_string (str))
647 {
648 char *dst = scm_i_string_writable_chars (str);
587a3355 649 dst[p] = chr;
9c44cd45
MG
650 }
651 else
652 {
653 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
654 dst[p] = chr;
655 }
656}
657
ba54a202 658\f
3ee86942 659/* Symbols.
587a3355 660
3ee86942
MV
661 Basic symbol creation and accessing is done here, the rest is in
662 symbols.[hc]. This has been done to keep stringbufs and the
663 internals of strings and string-like objects confined to this file.
664*/
665
666#define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
667
668SCM
6869328b
MV
669scm_i_make_symbol (SCM name, scm_t_bits flags,
670 unsigned long hash, SCM props)
3ee86942
MV
671{
672 SCM buf;
673 size_t start = STRING_START (name);
674 size_t length = STRING_LENGTH (name);
675
676 if (IS_SH_STRING (name))
677 {
678 name = SH_STRING_STRING (name);
679 start += STRING_START (name);
680 }
681 buf = SYMBOL_STRINGBUF (name);
682
683 if (start == 0 && length == STRINGBUF_LENGTH (buf))
684 {
685 /* reuse buf. */
9de87eea 686 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 687 SET_STRINGBUF_SHARED (buf);
9de87eea 688 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942
MV
689 }
690 else
691 {
692 /* make new buf. */
9c44cd45
MG
693 if (scm_i_is_narrow_string (name))
694 {
695 SCM new_buf = make_stringbuf (length);
696 memcpy (STRINGBUF_CHARS (new_buf),
697 STRINGBUF_CHARS (buf) + start, length);
698 buf = new_buf;
699 }
700 else
701 {
702 SCM new_buf = make_wide_stringbuf (length);
703 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
704 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
705 length);
706 buf = new_buf;
707 }
3ee86942 708 }
6869328b 709 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
3ee86942
MV
710 (scm_t_bits) hash, SCM_UNPACK (props));
711}
712
fd0a5bbc
HWN
713SCM
714scm_i_c_make_symbol (const char *name, size_t len,
715 scm_t_bits flags, unsigned long hash, SCM props)
716{
717 SCM buf = make_stringbuf (len);
718 memcpy (STRINGBUF_CHARS (buf), name, len);
719
a284cc7e
LC
720 return scm_immutable_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
721 (scm_t_bits) hash, SCM_UNPACK (props));
fd0a5bbc
HWN
722}
723
50b1996f
MG
724/* Returns the number of characters in SYM. This may be different
725 from the memory size of SYM. */
3ee86942
MV
726size_t
727scm_i_symbol_length (SCM sym)
0f2d19dd 728{
3ee86942 729 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
0f2d19dd
JB
730}
731
071bb6a8
LC
732size_t
733scm_c_symbol_length (SCM sym)
734#define FUNC_NAME "scm_c_symbol_length"
735{
736 SCM_VALIDATE_SYMBOL (1, sym);
737
738 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
739}
740#undef FUNC_NAME
741
50b1996f
MG
742/* True if the name of SYM is stored as a Latin-1 encoded string.
743 False if it is stored as a 32-bit UCS-4-encoded string. */
9c44cd45
MG
744int
745scm_i_is_narrow_symbol (SCM sym)
746{
747 SCM buf;
748
749 buf = SYMBOL_STRINGBUF (sym);
750 return !STRINGBUF_WIDE (buf);
751}
752
50b1996f
MG
753/* Returns a pointer to the 8-bit Latin-1 encoded character array that
754 contains the name of SYM. */
3ee86942
MV
755const char *
756scm_i_symbol_chars (SCM sym)
757{
9c44cd45
MG
758 SCM buf;
759
760 buf = SYMBOL_STRINGBUF (sym);
761 if (!STRINGBUF_WIDE (buf))
f59cf998 762 return (const char *) STRINGBUF_CHARS (buf);
9c44cd45
MG
763 else
764 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
765 scm_list_1 (sym));
766}
767
50b1996f
MG
768/* Return a pointer to the 32-bit UCS-4-encoded character array of a
769 symbol's name. */
9c44cd45
MG
770const scm_t_wchar *
771scm_i_symbol_wide_chars (SCM sym)
772{
773 SCM buf;
774
775 buf = SYMBOL_STRINGBUF (sym);
776 if (STRINGBUF_WIDE (buf))
f59cf998 777 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
9c44cd45
MG
778 else
779 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
780 scm_list_1 (sym));
3ee86942 781}
1cc91f1b 782
be54b15d 783SCM
3ee86942 784scm_i_symbol_substring (SCM sym, size_t start, size_t end)
be54b15d 785{
3ee86942 786 SCM buf = SYMBOL_STRINGBUF (sym);
9de87eea 787 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 788 SET_STRINGBUF_SHARED (buf);
9de87eea 789 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
fd2b17b9 790 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
3ee86942
MV
791 (scm_t_bits)start, (scm_t_bits) end - start);
792}
be54b15d 793
50b1996f 794/* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
9c44cd45
MG
795scm_t_wchar
796scm_i_symbol_ref (SCM sym, size_t x)
797{
798 if (scm_i_is_narrow_symbol (sym))
799 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
800 else
801 return scm_i_symbol_wide_chars (sym)[x];
802}
803
3ee86942
MV
804/* Debugging
805 */
be54b15d 806
6ce6923b
MG
807SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
808 "Returns an association list containing debugging information\n"
809 "for @var{str}. The association list has the following entries."
810 "@table @code\n"
811 "@item string\n"
812 "The string itself.\n"
813 "@item start\n"
814 "The start index of the string into its stringbuf\n"
815 "@item length\n"
816 "The length of the string\n"
817 "@item shared\n"
818 "If this string is a substring, it returns its parent string.\n"
819 "Otherwise, it returns @code{#f}\n"
88ed5759
MG
820 "@item read-only\n"
821 "@code{#t} if the string is read-only\n"
6ce6923b
MG
822 "@item stringbuf-chars\n"
823 "A new string containing this string's stringbuf's characters\n"
824 "@item stringbuf-length\n"
825 "The number of characters in this stringbuf\n"
826 "@item stringbuf-shared\n"
827 "@code{#t} if this stringbuf is shared\n"
6ce6923b
MG
828 "@item stringbuf-wide\n"
829 "@code{#t} if this stringbuf's characters are stored in a\n"
830 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
831 "buffer\n"
832 "@end table")
3ee86942
MV
833#define FUNC_NAME s_scm_sys_string_dump
834{
ba54a202 835 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
6ce6923b 836 SCM buf;
3ee86942 837 SCM_VALIDATE_STRING (1, str);
6ce6923b
MG
838
839 /* String info */
840 e1 = scm_cons (scm_from_locale_symbol ("string"),
841 str);
842 e2 = scm_cons (scm_from_locale_symbol ("start"),
843 scm_from_size_t (STRING_START (str)));
844 e3 = scm_cons (scm_from_locale_symbol ("length"),
845 scm_from_size_t (STRING_LENGTH (str)));
846
3ee86942
MV
847 if (IS_SH_STRING (str))
848 {
6ce6923b
MG
849 e4 = scm_cons (scm_from_locale_symbol ("shared"),
850 SH_STRING_STRING (str));
851 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
3ee86942
MV
852 }
853 else
854 {
6ce6923b
MG
855 e4 = scm_cons (scm_from_locale_symbol ("shared"),
856 SCM_BOOL_F);
857 buf = STRING_STRINGBUF (str);
3ee86942 858 }
9c44cd45 859
88ed5759
MG
860 if (IS_RO_STRING (str))
861 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
862 SCM_BOOL_T);
863 else
864 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
865 SCM_BOOL_F);
587a3355 866
6ce6923b 867 /* Stringbuf info */
6ce6923b
MG
868 if (!STRINGBUF_WIDE (buf))
869 {
870 size_t len = STRINGBUF_LENGTH (buf);
871 char *cbuf;
872 SCM sbc = scm_i_make_string (len, &cbuf);
873 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
874 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
875 sbc);
3ee86942 876 }
6ce6923b
MG
877 else
878 {
879 size_t len = STRINGBUF_LENGTH (buf);
880 scm_t_wchar *cbuf;
881 SCM sbc = scm_i_make_wide_string (len, &cbuf);
882 u32_cpy ((scm_t_uint32 *) cbuf,
883 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
884 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
885 sbc);
886 }
887 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
888 scm_from_size_t (STRINGBUF_LENGTH (buf)));
889 if (STRINGBUF_SHARED (buf))
890 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
891 SCM_BOOL_T);
892 else
893 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
894 SCM_BOOL_F);
6ce6923b 895 if (STRINGBUF_WIDE (buf))
ba54a202
LC
896 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
897 SCM_BOOL_T);
6ce6923b 898 else
ba54a202
LC
899 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
900 SCM_BOOL_F);
6ce6923b 901
ba54a202 902 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
3ee86942
MV
903}
904#undef FUNC_NAME
905
6ce6923b
MG
906SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
907 "Returns an association list containing debugging information\n"
908 "for @var{sym}. The association list has the following entries."
909 "@table @code\n"
910 "@item symbol\n"
911 "The symbol itself\n"
912 "@item hash\n"
913 "Its hash value\n"
88ed5759
MG
914 "@item interned\n"
915 "@code{#t} if it is an interned symbol\n"
6ce6923b
MG
916 "@item stringbuf-chars\n"
917 "A new string containing this symbols's stringbuf's characters\n"
918 "@item stringbuf-length\n"
919 "The number of characters in this stringbuf\n"
920 "@item stringbuf-shared\n"
921 "@code{#t} if this stringbuf is shared\n"
6ce6923b
MG
922 "@item stringbuf-wide\n"
923 "@code{#t} if this stringbuf's characters are stored in a\n"
924 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
925 "buffer\n"
926 "@end table")
3ee86942
MV
927#define FUNC_NAME s_scm_sys_symbol_dump
928{
ba54a202 929 SCM e1, e2, e3, e4, e5, e6, e7;
6ce6923b 930 SCM buf;
3ee86942 931 SCM_VALIDATE_SYMBOL (1, sym);
6ce6923b
MG
932 e1 = scm_cons (scm_from_locale_symbol ("symbol"),
933 sym);
934 e2 = scm_cons (scm_from_locale_symbol ("hash"),
935 scm_from_ulong (scm_i_symbol_hash (sym)));
88ed5759
MG
936 e3 = scm_cons (scm_from_locale_symbol ("interned"),
937 scm_symbol_interned_p (sym));
6ce6923b
MG
938 buf = SYMBOL_STRINGBUF (sym);
939
940 /* Stringbuf info */
6ce6923b
MG
941 if (!STRINGBUF_WIDE (buf))
942 {
943 size_t len = STRINGBUF_LENGTH (buf);
944 char *cbuf;
945 SCM sbc = scm_i_make_string (len, &cbuf);
946 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
947 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
948 sbc);
949 }
9c44cd45 950 else
6ce6923b
MG
951 {
952 size_t len = STRINGBUF_LENGTH (buf);
953 scm_t_wchar *cbuf;
954 SCM sbc = scm_i_make_wide_string (len, &cbuf);
955 u32_cpy ((scm_t_uint32 *) cbuf,
956 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
957 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
958 sbc);
959 }
960 e5 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
961 scm_from_size_t (STRINGBUF_LENGTH (buf)));
962 if (STRINGBUF_SHARED (buf))
963 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
964 SCM_BOOL_T);
965 else
966 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
967 SCM_BOOL_F);
6ce6923b 968 if (STRINGBUF_WIDE (buf))
ba54a202 969 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
6ce6923b
MG
970 SCM_BOOL_T);
971 else
ba54a202 972 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
6ce6923b 973 SCM_BOOL_F);
ba54a202 974 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
6ce6923b 975
3ee86942
MV
976}
977#undef FUNC_NAME
978
56a3dcd4 979#ifdef SCM_STRING_LENGTH_HISTOGRAM
6ce6923b 980
9c44cd45 981SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
e1b29f6a 982#define FUNC_NAME s_scm_sys_stringbuf_hist
3ee86942
MV
983{
984 int i;
985 for (i = 0; i < 1000; i++)
986 if (lenhist[i])
987 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
988 fprintf (stderr, ">999: %u\n", lenhist[1000]);
989 return SCM_UNSPECIFIED;
be54b15d
DH
990}
991#undef FUNC_NAME
992
3ee86942
MV
993#endif
994
995\f
996
997SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
998 (SCM obj),
999 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1000#define FUNC_NAME s_scm_string_p
1001{
1002 return scm_from_bool (IS_STRING (obj));
1003}
1004#undef FUNC_NAME
1005
1006
1007SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1008
1009SCM_DEFINE (scm_string, "string", 0, 0, 1,
1010 (SCM chrs),
1011 "@deffnx {Scheme Procedure} list->string chrs\n"
1012 "Return a newly allocated string composed of the arguments,\n"
1013 "@var{chrs}.")
1014#define FUNC_NAME s_scm_string
1015{
9aa27c1a 1016 SCM result = SCM_BOOL_F;
9c44cd45 1017 SCM rest;
3ee86942 1018 size_t len;
9c44cd45
MG
1019 size_t p = 0;
1020 long i;
9aa27c1a 1021 int wide = 0;
3ee86942 1022
9c44cd45
MG
1023 /* Verify that this is a list of chars. */
1024 i = scm_ilength (chrs);
3c7cf7f5 1025 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
3ee86942 1026
9c44cd45
MG
1027 len = (size_t) i;
1028 rest = chrs;
3ee86942 1029
9c44cd45 1030 while (len > 0 && scm_is_pair (rest))
3ee86942 1031 {
9c44cd45 1032 SCM elt = SCM_CAR (rest);
3ee86942 1033 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
9aa27c1a
MG
1034 if (SCM_CHAR (elt) > 0xFF)
1035 wide = 1;
9c44cd45
MG
1036 rest = SCM_CDR (rest);
1037 len--;
1038 scm_remember_upto_here_1 (elt);
1039 }
1040
1041 /* Construct a string containing this list of chars. */
1042 len = (size_t) i;
1043 rest = chrs;
1044
9aa27c1a 1045 if (wide == 0)
9c44cd45 1046 {
56a3dcd4
LC
1047 char *buf;
1048
9aa27c1a
MG
1049 result = scm_i_make_string (len, NULL);
1050 result = scm_i_string_start_writing (result);
56a3dcd4 1051 buf = scm_i_string_writable_chars (result);
9aa27c1a
MG
1052 while (len > 0 && scm_is_pair (rest))
1053 {
1054 SCM elt = SCM_CAR (rest);
1055 buf[p] = (unsigned char) SCM_CHAR (elt);
1056 p++;
1057 rest = SCM_CDR (rest);
1058 len--;
1059 scm_remember_upto_here_1 (elt);
1060 }
1061 }
1062 else
1063 {
56a3dcd4
LC
1064 scm_t_wchar *buf;
1065
9aa27c1a
MG
1066 result = scm_i_make_wide_string (len, NULL);
1067 result = scm_i_string_start_writing (result);
56a3dcd4 1068 buf = scm_i_string_writable_wide_chars (result);
9aa27c1a
MG
1069 while (len > 0 && scm_is_pair (rest))
1070 {
1071 SCM elt = SCM_CAR (rest);
1072 buf[p] = SCM_CHAR (elt);
1073 p++;
1074 rest = SCM_CDR (rest);
1075 len--;
1076 scm_remember_upto_here_1 (elt);
1077 }
3ee86942 1078 }
9c44cd45
MG
1079 scm_i_string_stop_writing ();
1080
3ee86942
MV
1081 if (len > 0)
1082 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
9c44cd45 1083 if (!scm_is_null (rest))
3ee86942
MV
1084 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1085
1086 return result;
1087}
1088#undef FUNC_NAME
be54b15d 1089
3b3b36dd 1090SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
6fa73e72 1091 (SCM k, SCM chr),
0d26a824
MG
1092 "Return a newly allocated string of\n"
1093 "length @var{k}. If @var{chr} is given, then all elements of\n"
1094 "the string are initialized to @var{chr}, otherwise the contents\n"
9401323e 1095 "of the @var{string} are unspecified.")
1bbd0b84 1096#define FUNC_NAME s_scm_make_string
0f2d19dd 1097{
3ee86942
MV
1098 return scm_c_make_string (scm_to_size_t (k), chr);
1099}
1100#undef FUNC_NAME
1101
1102SCM
1103scm_c_make_string (size_t len, SCM chr)
1104#define FUNC_NAME NULL
1105{
9c44cd45
MG
1106 size_t p;
1107 SCM res = scm_i_make_string (len, NULL);
cb0d8be2 1108
e11e83f3
MV
1109 if (!SCM_UNBNDP (chr))
1110 {
3ee86942 1111 SCM_VALIDATE_CHAR (0, chr);
9c44cd45
MG
1112 res = scm_i_string_start_writing (res);
1113 for (p = 0; p < len; p++)
1114 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1115 scm_i_string_stop_writing ();
0f2d19dd 1116 }
e11e83f3
MV
1117
1118 return res;
0f2d19dd 1119}
1bbd0b84 1120#undef FUNC_NAME
0f2d19dd 1121
3b3b36dd 1122SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
0d26a824
MG
1123 (SCM string),
1124 "Return the number of characters in @var{string}.")
1bbd0b84 1125#define FUNC_NAME s_scm_string_length
0f2d19dd 1126{
d1ca2c64 1127 SCM_VALIDATE_STRING (1, string);
3ee86942 1128 return scm_from_size_t (STRING_LENGTH (string));
0f2d19dd 1129}
1bbd0b84 1130#undef FUNC_NAME
0f2d19dd 1131
f8ba2bb9 1132SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
9c44cd45
MG
1133 (SCM string),
1134 "Return the bytes used to represent a character in @var{string}."
1135 "This will return 1 or 4.")
f8ba2bb9 1136#define FUNC_NAME s_scm_string_bytes_per_char
9c44cd45
MG
1137{
1138 SCM_VALIDATE_STRING (1, string);
1139 if (!scm_i_is_narrow_string (string))
1140 return scm_from_int (4);
1141
1142 return scm_from_int (1);
1143}
1144#undef FUNC_NAME
1145
3ee86942
MV
1146size_t
1147scm_c_string_length (SCM string)
1148{
1149 if (!IS_STRING (string))
1150 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1151 return STRING_LENGTH (string);
1152}
1153
bd9e24b3 1154SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
6fa73e72 1155 (SCM str, SCM k),
9c44cd45
MG
1156 "Return character @var{k} of @var{str} using zero-origin\n"
1157 "indexing. @var{k} must be a valid index of @var{str}.")
1bbd0b84 1158#define FUNC_NAME s_scm_string_ref
0f2d19dd 1159{
3ae3166b 1160 size_t len;
a55c2b68 1161 unsigned long idx;
bd9e24b3 1162
d1ca2c64 1163 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1164
1165 len = scm_i_string_length (str);
1166 if (SCM_LIKELY (len > 0))
1167 idx = scm_to_unsigned_integer (k, 0, len - 1);
1168 else
1169 scm_out_of_range (NULL, k);
1170
9c44cd45
MG
1171 if (scm_i_is_narrow_string (str))
1172 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1173 else
1174 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
0f2d19dd 1175}
1bbd0b84 1176#undef FUNC_NAME
0f2d19dd 1177
3ee86942
MV
1178SCM
1179scm_c_string_ref (SCM str, size_t p)
1180{
1181 if (p >= scm_i_string_length (str))
1182 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1183 if (scm_i_is_narrow_string (str))
1184 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1185 else
1186 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1187
3ee86942 1188}
f0942910 1189
3b3b36dd 1190SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
6fa73e72 1191 (SCM str, SCM k, SCM chr),
9c44cd45
MG
1192 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1193 "an unspecified value. @var{k} must be a valid index of\n"
1194 "@var{str}.")
1bbd0b84 1195#define FUNC_NAME s_scm_string_set_x
0f2d19dd 1196{
3ae3166b 1197 size_t len;
a55c2b68
MV
1198 unsigned long idx;
1199
f0942910 1200 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1201
1202 len = scm_i_string_length (str);
1203 if (SCM_LIKELY (len > 0))
1204 idx = scm_to_unsigned_integer (k, 0, len - 1);
1205 else
1206 scm_out_of_range (NULL, k);
1207
34d19ef6 1208 SCM_VALIDATE_CHAR (3, chr);
9c44cd45
MG
1209 str = scm_i_string_start_writing (str);
1210 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1211 scm_i_string_stop_writing ();
1212
0f2d19dd
JB
1213 return SCM_UNSPECIFIED;
1214}
1bbd0b84 1215#undef FUNC_NAME
0f2d19dd 1216
3ee86942
MV
1217void
1218scm_c_string_set_x (SCM str, size_t p, SCM chr)
1219{
1220 if (p >= scm_i_string_length (str))
1221 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1222 str = scm_i_string_start_writing (str);
1223 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1224 scm_i_string_stop_writing ();
3ee86942 1225}
0f2d19dd 1226
3b3b36dd 1227SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
0d26a824
MG
1228 (SCM str, SCM start, SCM end),
1229 "Return a newly allocated string formed from the characters\n"
1230 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1231 "ending with index @var{end} (exclusive).\n"
1232 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1233 "exact integers satisfying:\n\n"
1234 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1bbd0b84 1235#define FUNC_NAME s_scm_substring
0f2d19dd 1236{
3ee86942 1237 size_t len, from, to;
685c0d71 1238
d1ca2c64 1239 SCM_VALIDATE_STRING (1, str);
3ee86942
MV
1240 len = scm_i_string_length (str);
1241 from = scm_to_unsigned_integer (start, 0, len);
a55c2b68 1242 if (SCM_UNBNDP (end))
3ee86942 1243 to = len;
a55c2b68 1244 else
3ee86942
MV
1245 to = scm_to_unsigned_integer (end, from, len);
1246 return scm_i_substring (str, from, to);
0f2d19dd 1247}
1bbd0b84 1248#undef FUNC_NAME
0f2d19dd 1249
ed35de72
MV
1250SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1251 (SCM str, SCM start, SCM end),
1252 "Return a newly allocated string formed from the characters\n"
1253 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1254 "ending with index @var{end} (exclusive).\n"
1255 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1256 "exact integers satisfying:\n"
1257 "\n"
1258 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1259 "\n"
1260 "The returned string is read-only.\n")
1261#define FUNC_NAME s_scm_substring_read_only
1262{
1263 size_t len, from, to;
1264
1265 SCM_VALIDATE_STRING (1, str);
1266 len = scm_i_string_length (str);
1267 from = scm_to_unsigned_integer (start, 0, len);
1268 if (SCM_UNBNDP (end))
1269 to = len;
1270 else
1271 to = scm_to_unsigned_integer (end, from, len);
1272 return scm_i_substring_read_only (str, from, to);
1273}
1274#undef FUNC_NAME
1275
3ee86942
MV
1276SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1277 (SCM str, SCM start, SCM end),
1278 "Return a newly allocated string formed from the characters\n"
1279 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1280 "ending with index @var{end} (exclusive).\n"
1281 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1282 "exact integers satisfying:\n\n"
1283 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1284#define FUNC_NAME s_scm_substring_copy
1285{
e1b29f6a
MV
1286 /* For the Scheme version, START is mandatory, but for the C
1287 version, it is optional. See scm_string_copy in srfi-13.c for a
1288 rationale.
1289 */
1290
1291 size_t from, to;
3ee86942
MV
1292
1293 SCM_VALIDATE_STRING (1, str);
e1b29f6a
MV
1294 scm_i_get_substring_spec (scm_i_string_length (str),
1295 start, &from, end, &to);
3ee86942
MV
1296 return scm_i_substring_copy (str, from, to);
1297}
1298#undef FUNC_NAME
1299
1300SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1301 (SCM str, SCM start, SCM end),
1302 "Return string that indirectly refers to the characters\n"
1303 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1304 "ending with index @var{end} (exclusive).\n"
1305 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1306 "exact integers satisfying:\n\n"
1307 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1308#define FUNC_NAME s_scm_substring_shared
1309{
1310 size_t len, from, to;
1311
1312 SCM_VALIDATE_STRING (1, str);
1313 len = scm_i_string_length (str);
1314 from = scm_to_unsigned_integer (start, 0, len);
1315 if (SCM_UNBNDP (end))
1316 to = len;
1317 else
1318 to = scm_to_unsigned_integer (end, from, len);
1319 return scm_i_substring_shared (str, from, to);
1320}
1321#undef FUNC_NAME
685c0d71 1322
3b3b36dd 1323SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
6fa73e72 1324 (SCM args),
9c44cd45 1325 "Return a newly allocated string whose characters form the\n"
0d26a824 1326 "concatenation of the given strings, @var{args}.")
1bbd0b84 1327#define FUNC_NAME s_scm_string_append
0f2d19dd
JB
1328{
1329 SCM res;
9c44cd45
MG
1330 size_t len = 0;
1331 int wide = 0;
c829a427 1332 SCM l, s;
bd4911ef 1333 size_t i;
9909c395
MG
1334 union
1335 {
1336 char *narrow;
1337 scm_t_wchar *wide;
1338 } data;
af45e3b0
DH
1339
1340 SCM_VALIDATE_REST_ARGUMENT (args);
9c44cd45 1341 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427
MV
1342 {
1343 s = SCM_CAR (l);
1344 SCM_VALIDATE_STRING (SCM_ARGn, s);
9c44cd45
MG
1345 len += scm_i_string_length (s);
1346 if (!scm_i_is_narrow_string (s))
1347 wide = 1;
c829a427 1348 }
9909c395 1349 data.narrow = NULL;
9c44cd45 1350 if (!wide)
9909c395 1351 res = scm_i_make_string (len, &data.narrow);
9c44cd45 1352 else
9909c395 1353 res = scm_i_make_wide_string (len, &data.wide);
9c44cd45
MG
1354
1355 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427 1356 {
edea856c 1357 size_t len;
c829a427 1358 s = SCM_CAR (l);
3ee86942 1359 SCM_VALIDATE_STRING (SCM_ARGn, s);
edea856c 1360 len = scm_i_string_length (s);
9c44cd45
MG
1361 if (!wide)
1362 {
9909c395
MG
1363 memcpy (data.narrow, scm_i_string_chars (s), len);
1364 data.narrow += len;
9c44cd45
MG
1365 }
1366 else
1367 {
1368 if (scm_i_is_narrow_string (s))
1369 {
1370 for (i = 0; i < scm_i_string_length (s); i++)
9909c395 1371 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
9c44cd45
MG
1372 }
1373 else
9909c395 1374 u32_cpy ((scm_t_uint32 *) data.wide,
9c44cd45 1375 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
9909c395 1376 data.wide += len;
9c44cd45 1377 }
c829a427
MV
1378 scm_remember_upto_here_1 (s);
1379 }
0f2d19dd
JB
1380 return res;
1381}
1bbd0b84 1382#undef FUNC_NAME
0f2d19dd 1383
c829a427
MV
1384int
1385scm_is_string (SCM obj)
1386{
3ee86942 1387 return IS_STRING (obj);
c829a427 1388}
24933780 1389
a3d7d5d5
LC
1390\f
1391/* Conversion to/from other encodings. */
1392
1393SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1394static void
1395scm_encoding_error (const char *subr, const char *message, SCM args)
1396{
1397 scm_error (scm_encoding_error_key, subr, message, args, SCM_BOOL_F);
1398}
1399
fac32b51 1400SCM
587a3355
MG
1401scm_from_stringn (const char *str, size_t len, const char *encoding,
1402 scm_t_string_failed_conversion_handler handler)
1403{
1404 size_t u32len, i;
1405 scm_t_wchar *u32;
1406 int wide = 0;
1407 SCM res;
1408
fac32b51
MG
1409 if (len == 0)
1410 return scm_nullstr;
1411
889975e5
MG
1412 if (encoding == NULL)
1413 {
1414 /* If encoding is null, use Latin-1. */
1415 char *buf;
1416 res = scm_i_make_string (len, &buf);
1417 memcpy (buf, str, len);
1418 return res;
1419 }
1420
587a3355
MG
1421 u32len = 0;
1422 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1423 (enum iconv_ilseq_handler)
1424 handler,
1425 str, len,
1426 NULL,
1427 NULL, &u32len);
1428
1429 if (u32 == NULL)
1430 {
1431 if (errno == ENOMEM)
1432 scm_memory_error ("locale string conversion");
1433 else
1434 {
889975e5 1435 /* There are invalid sequences in the input string. */
587a3355
MG
1436 SCM errstr;
1437 char *dst;
587a3355
MG
1438 errstr = scm_i_make_string (len, &dst);
1439 memcpy (dst, str, len);
a3d7d5d5
LC
1440 scm_encoding_error (NULL,
1441 "input locale conversion error from ~s: ~s",
1442 scm_list_2 (scm_from_locale_string (encoding),
1443 errstr));
587a3355
MG
1444 scm_remember_upto_here_1 (errstr);
1445 }
1446 }
1447
1448 i = 0;
1449 while (i < u32len)
1450 if (u32[i++] > 0xFF)
1451 {
1452 wide = 1;
1453 break;
1454 }
1455
1456 if (!wide)
1457 {
1458 char *dst;
1459 res = scm_i_make_string (u32len, &dst);
1460 for (i = 0; i < u32len; i ++)
1461 dst[i] = (unsigned char) u32[i];
1462 dst[u32len] = '\0';
1463 }
1464 else
1465 {
1466 scm_t_wchar *wdst;
1467 res = scm_i_make_wide_string (u32len, &wdst);
1468 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1469 wdst[u32len] = 0;
1470 }
1471
1472 free (u32);
1473 return res;
1474}
1475
c829a427
MV
1476SCM
1477scm_from_locale_stringn (const char *str, size_t len)
1478{
889975e5
MG
1479 const char *enc;
1480 scm_t_string_failed_conversion_handler hndl;
1481 SCM inport;
1482 scm_t_port *pt;
4d4528e7 1483
9c44cd45 1484 if (len == (size_t) -1)
c829a427 1485 len = strlen (str);
9c44cd45
MG
1486 if (len == 0)
1487 return scm_nullstr;
1488
889975e5
MG
1489 inport = scm_current_input_port ();
1490 if (!SCM_UNBNDP (inport) && SCM_OPINPORTP (inport))
1491 {
1492 pt = SCM_PTAB_ENTRY (inport);
1493 enc = pt->encoding;
1494 hndl = pt->ilseq_handler;
1495 }
1496 else
1497 {
1498 enc = NULL;
1499 hndl = SCM_FAILED_CONVERSION_ERROR;
1500 }
1501
1502 return scm_from_stringn (str, len, enc, hndl);
c829a427 1503}
4d4528e7 1504
c829a427
MV
1505SCM
1506scm_from_locale_string (const char *str)
4d4528e7 1507{
9c44cd45
MG
1508 if (str == NULL)
1509 return scm_nullstr;
1510
c829a427
MV
1511 return scm_from_locale_stringn (str, -1);
1512}
4d4528e7 1513
587a3355
MG
1514SCM
1515scm_i_from_utf8_string (const scm_t_uint8 *str)
1516{
1517 return scm_from_stringn ((const char *) str,
1518 strlen ((char *) str), "UTF-8",
1519 SCM_FAILED_CONVERSION_ERROR);
1520}
1521
50b1996f
MG
1522/* Create a new scheme string from the C string STR. The memory of
1523 STR may be used directly as storage for the new string. */
13a94556
LC
1524/* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1525 would be to register a finalizer to eventually free(3) STR, which isn't
1526 worth it. Should we just deprecate the `scm_take_' functions? */
c829a427
MV
1527SCM
1528scm_take_locale_stringn (char *str, size_t len)
1529{
13a94556 1530 SCM res;
48ddf0d9 1531
13a94556
LC
1532 res = scm_from_locale_stringn (str, len);
1533 free (str);
c829a427 1534
c829a427
MV
1535 return res;
1536}
1537
48ddf0d9
KR
1538SCM
1539scm_take_locale_string (char *str)
1540{
1541 return scm_take_locale_stringn (str, -1);
1542}
1543
9c44cd45
MG
1544/* Change libunistring escapes (\uXXXX and \UXXXXXXXX) to \xXX \uXXXX
1545 and \UXXXXXX. */
1546static void
1547unistring_escapes_to_guile_escapes (char **bufp, size_t *lenp)
1548{
1549 char *before, *after;
1550 size_t i, j;
1551
1552 before = *bufp;
1553 after = *bufp;
1554 i = 0;
1555 j = 0;
1556 while (i < *lenp)
1557 {
1558 if ((i <= *lenp - 6)
1559 && before[i] == '\\'
1560 && before[i + 1] == 'u'
1561 && before[i + 2] == '0' && before[i + 3] == '0')
1562 {
1563 /* Convert \u00NN to \xNN */
1564 after[j] = '\\';
1565 after[j + 1] = 'x';
30a6b9ca
MG
1566 after[j + 2] = tolower ((int) before[i + 4]);
1567 after[j + 3] = tolower ((int) before[i + 5]);
9c44cd45
MG
1568 i += 6;
1569 j += 4;
1570 }
1571 else if ((i <= *lenp - 10)
1572 && before[i] == '\\'
1573 && before[i + 1] == 'U'
1574 && before[i + 2] == '0' && before[i + 3] == '0')
1575 {
1576 /* Convert \U00NNNNNN to \UNNNNNN */
1577 after[j] = '\\';
1578 after[j + 1] = 'U';
30a6b9ca
MG
1579 after[j + 2] = tolower ((int) before[i + 4]);
1580 after[j + 3] = tolower ((int) before[i + 5]);
1581 after[j + 4] = tolower ((int) before[i + 6]);
1582 after[j + 5] = tolower ((int) before[i + 7]);
1583 after[j + 6] = tolower ((int) before[i + 8]);
1584 after[j + 7] = tolower ((int) before[i + 9]);
9c44cd45
MG
1585 i += 10;
1586 j += 8;
1587 }
1588 else
1589 {
1590 after[j] = before[i];
1591 i++;
1592 j++;
1593 }
1594 }
1595 *lenp = j;
1596 after = scm_realloc (after, j);
1597}
1598
c829a427 1599char *
fac32b51 1600scm_to_locale_stringn (SCM str, size_t *lenp)
c829a427 1601{
889975e5
MG
1602 SCM outport;
1603 scm_t_port *pt;
9c44cd45
MG
1604 const char *enc;
1605
889975e5
MG
1606 outport = scm_current_output_port ();
1607 if (!SCM_UNBNDP (outport) && SCM_OPOUTPORTP (outport))
1608 {
1609 pt = SCM_PTAB_ENTRY (outport);
1610 enc = pt->encoding;
1611 }
1612 else
1613 enc = NULL;
9c44cd45 1614
889975e5
MG
1615 return scm_to_stringn (str, lenp,
1616 enc,
1617 scm_i_get_conversion_strategy (SCM_BOOL_F));
9c44cd45
MG
1618}
1619
29bcdbb0
LC
1620/* Return a malloc(3)-allocated buffer containing the contents of STR encoded
1621 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
1622 the returned buffer. If the conversion to ENCODING fails, apply the strategy
1623 defined by HANDLER. */
9c44cd45 1624char *
587a3355 1625scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
eca29b02 1626 scm_t_string_failed_conversion_handler handler)
9c44cd45 1627{
9c44cd45
MG
1628 char *buf;
1629 size_t ilen, len, i;
889975e5
MG
1630 int ret;
1631 const char *enc;
4d4528e7 1632
3ee86942 1633 if (!scm_is_string (str))
c829a427 1634 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1635 ilen = scm_i_string_length (str);
1636
1637 if (ilen == 0)
1638 {
1639 buf = scm_malloc (1);
1640 buf[0] = '\0';
1641 if (lenp)
1642 *lenp = 0;
1643 return buf;
1644 }
587a3355 1645
c829a427 1646 if (lenp == NULL)
9c44cd45
MG
1647 for (i = 0; i < ilen; i++)
1648 if (scm_i_string_ref (str, i) == '\0')
1649 scm_misc_error (NULL,
1650 "string contains #\\nul character: ~S",
1651 scm_list_1 (str));
1652
889975e5 1653 if (scm_i_is_narrow_string (str) && (encoding == NULL))
c829a427 1654 {
889975e5
MG
1655 /* If using native Latin-1 encoding, just copy the string
1656 contents. */
9c44cd45
MG
1657 if (lenp)
1658 {
1659 buf = scm_malloc (ilen);
1660 memcpy (buf, scm_i_string_chars (str), ilen);
1661 *lenp = ilen;
1662 return buf;
1663 }
1664 else
1665 {
1666 buf = scm_malloc (ilen + 1);
1667 memcpy (buf, scm_i_string_chars (str), ilen);
1668 buf[ilen] = '\0';
1669 return buf;
1670 }
c829a427 1671 }
9c44cd45 1672
587a3355 1673
9c44cd45
MG
1674 buf = NULL;
1675 len = 0;
889975e5
MG
1676 enc = encoding;
1677 if (enc == NULL)
1678 enc = "ISO-8859-1";
1679 if (scm_i_is_narrow_string (str))
1680 {
1681 ret = mem_iconveh (scm_i_string_chars (str), ilen,
1682 "ISO-8859-1", enc,
1683 (enum iconv_ilseq_handler) handler, NULL,
1684 &buf, &len);
9c44cd45 1685
889975e5
MG
1686 if (ret == 0 && handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
1687 unistring_escapes_to_guile_escapes (&buf, &len);
9c44cd45 1688
889975e5 1689 if (ret != 0)
a3d7d5d5
LC
1690 scm_encoding_error (NULL, "cannot convert to output locale ~s: \"~s\"",
1691 scm_list_2 (scm_from_locale_string (enc), str));
889975e5
MG
1692 }
1693 else
1694 {
1695 buf = u32_conv_to_encoding (enc,
1696 (enum iconv_ilseq_handler) handler,
1697 (scm_t_uint32 *) scm_i_string_wide_chars (str),
1698 ilen,
1699 NULL,
1700 NULL, &len);
1701 if (buf == NULL)
a3d7d5d5
LC
1702 scm_encoding_error (NULL, "cannot convert to output locale ~s: \"~s\"",
1703 scm_list_2 (scm_from_locale_string (enc), str));
1704
5f5920e0
MG
1705 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
1706 unistring_escapes_to_guile_escapes (&buf, &len);
889975e5 1707 }
9c44cd45 1708 if (lenp)
4d4528e7 1709 *lenp = len;
9c44cd45
MG
1710 else
1711 {
1712 buf = scm_realloc (buf, len + 1);
1713 buf[len] = '\0';
1714 }
24933780 1715
c829a427 1716 scm_remember_upto_here_1 (str);
9c44cd45 1717 return buf;
4d4528e7 1718}
af68e5e5 1719
c829a427
MV
1720char *
1721scm_to_locale_string (SCM str)
1722{
1723 return scm_to_locale_stringn (str, NULL);
1724}
af68e5e5 1725
587a3355
MG
1726scm_t_uint8 *
1727scm_i_to_utf8_string (SCM str)
1728{
1729 char *u8str;
1730 u8str = scm_to_stringn (str, NULL, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1731 return (scm_t_uint8 *) u8str;
1732}
1733
c829a427
MV
1734size_t
1735scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
1736{
1737 size_t len;
9c44cd45 1738 char *result = NULL;
3ee86942 1739 if (!scm_is_string (str))
c829a427 1740 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1741 result = scm_to_locale_stringn (str, &len);
1742
1743 memcpy (buf, result, (len > max_len) ? max_len : len);
1744 free (result);
1745
c829a427
MV
1746 scm_remember_upto_here_1 (str);
1747 return len;
1748}
af68e5e5 1749
a3d7d5d5
LC
1750\f
1751/* Unicode string normalization. */
1752
edb7bb47
JG
1753/* This function is a partial clone of SCM_STRING_TO_U32_BUF from
1754 libguile/i18n.c. It would be useful to have this factored out into a more
1755 convenient location, but its use of alloca makes that tricky to do. */
1756
1757static SCM
1758normalize_str (SCM string, uninorm_t form)
1759{
1760 SCM ret;
1761 scm_t_uint32 *w_str;
1762 scm_t_wchar *cbuf;
1763 size_t rlen, len = scm_i_string_length (string);
1764
1765 if (scm_i_is_narrow_string (string))
1766 {
1767 size_t i;
1768 const char *buf = scm_i_string_chars (string);
1769
1770 w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
1771
1772 for (i = 0; i < len; i ++)
1773 w_str[i] = (unsigned char) buf[i];
1774 w_str[len] = 0;
1775 }
d8164b04
JG
1776 else
1777 w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
1778
edb7bb47
JG
1779 w_str = u32_normalize (form, w_str, len, NULL, &rlen);
1780
1781 ret = scm_i_make_wide_string (rlen, &cbuf);
1782 u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
1783 free (w_str);
d8164b04
JG
1784
1785 scm_i_try_narrow_string (ret);
1786
edb7bb47
JG
1787 return ret;
1788}
1789
1790SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
1791 (SCM string),
1792 "Returns the NFC normalized form of @var{string}.")
1793#define FUNC_NAME s_scm_string_normalize_nfc
1794{
1795 SCM_VALIDATE_STRING (1, string);
1796 return normalize_str (string, UNINORM_NFC);
1797}
1798#undef FUNC_NAME
1799
1800SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
1801 (SCM string),
1802 "Returns the NFD normalized form of @var{string}.")
1803#define FUNC_NAME s_scm_string_normalize_nfd
1804{
1805 SCM_VALIDATE_STRING (1, string);
1806 return normalize_str (string, UNINORM_NFD);
1807}
1808#undef FUNC_NAME
1809
1810SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
1811 (SCM string),
1812 "Returns the NFKC normalized form of @var{string}.")
1813#define FUNC_NAME s_scm_string_normalize_nfkc
1814{
1815 SCM_VALIDATE_STRING (1, string);
1816 return normalize_str (string, UNINORM_NFKC);
1817}
1818#undef FUNC_NAME
1819
1820SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
1821 (SCM string),
1822 "Returns the NFKD normalized form of @var{string}.")
1823#define FUNC_NAME s_scm_string_normalize_nfkd
1824{
1825 SCM_VALIDATE_STRING (1, string);
1826 return normalize_str (string, UNINORM_NFKD);
1827}
1828#undef FUNC_NAME
1829
3ee86942
MV
1830/* converts C scm_array of strings to SCM scm_list of strings. */
1831/* If argc < 0, a null terminated scm_array is assumed. */
9c44cd45 1832SCM
3ee86942
MV
1833scm_makfromstrs (int argc, char **argv)
1834{
1835 int i = argc;
1836 SCM lst = SCM_EOL;
1837 if (0 > i)
1838 for (i = 0; argv[i]; i++);
1839 while (i--)
1840 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
1841 return lst;
1842}
1843
c829a427
MV
1844/* Return a newly allocated array of char pointers to each of the strings
1845 in args, with a terminating NULL pointer. */
1846
1847char **
1848scm_i_allocate_string_pointers (SCM list)
2a776823 1849#define FUNC_NAME "scm_i_allocate_string_pointers"
af68e5e5 1850{
c829a427
MV
1851 char **result;
1852 int len = scm_ilength (list);
1853 int i;
1854
1855 if (len < 0)
1856 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
1857
2a776823
LC
1858 result = scm_gc_malloc ((len + 1) * sizeof (char *),
1859 "string pointers");
c829a427 1860 result[len] = NULL;
c829a427
MV
1861
1862 /* The list might be have been modified in another thread, so
1863 we check LIST before each access.
1864 */
d2e53ed6 1865 for (i = 0; i < len && scm_is_pair (list); i++)
c829a427 1866 {
2a776823
LC
1867 SCM str;
1868 size_t len;
1869
1870 str = SCM_CAR (list);
1871 len = scm_c_string_length (str);
1872
1873 result[i] = scm_gc_malloc_pointerless (len + 1, "string pointers");
1874 memcpy (result[i], scm_i_string_chars (str), len);
1875 result[i][len] = '\0';
1876
c829a427
MV
1877 list = SCM_CDR (list);
1878 }
1879
c829a427 1880 return result;
af68e5e5 1881}
2a776823 1882#undef FUNC_NAME
24933780 1883
6f14f578
MV
1884void
1885scm_i_get_substring_spec (size_t len,
1886 SCM start, size_t *cstart,
1887 SCM end, size_t *cend)
1888{
1889 if (SCM_UNBNDP (start))
1890 *cstart = 0;
1891 else
1892 *cstart = scm_to_unsigned_integer (start, 0, len);
1893
1894 if (SCM_UNBNDP (end))
1895 *cend = len;
1896 else
1897 *cend = scm_to_unsigned_integer (end, *cstart, len);
1898}
1899
3ee86942
MV
1900#if SCM_ENABLE_DEPRECATED
1901
556d75db
MV
1902/* When these definitions are removed, it becomes reasonable to use
1903 read-only strings for string literals. For that, change the reader
1904 to create string literals with scm_c_substring_read_only instead of
1905 with scm_c_substring_copy.
1906*/
1907
3ee86942 1908int
fe78c51a 1909scm_i_deprecated_stringp (SCM str)
3ee86942
MV
1910{
1911 scm_c_issue_deprecation_warning
1912 ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
1913
2616f0e0 1914 return scm_is_string (str);
3ee86942
MV
1915}
1916
1917char *
fe78c51a 1918scm_i_deprecated_string_chars (SCM str)
3ee86942
MV
1919{
1920 char *chars;
1921
1922 scm_c_issue_deprecation_warning
1923 ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
1924
2616f0e0
MV
1925 /* We don't accept shared substrings here since they are not
1926 null-terminated.
1927 */
1928 if (IS_SH_STRING (str))
c291b588
LC
1929 scm_misc_error (NULL,
1930 "SCM_STRING_CHARS does not work with shared substrings",
2616f0e0
MV
1931 SCM_EOL);
1932
877f06c3 1933 /* We explicitly test for read-only strings to produce a better
556d75db
MV
1934 error message.
1935 */
1936
1937 if (IS_RO_STRING (str))
c291b588
LC
1938 scm_misc_error (NULL,
1939 "SCM_STRING_CHARS does not work with read-only strings",
556d75db 1940 SCM_EOL);
c291b588 1941
2616f0e0 1942 /* The following is still wrong, of course...
3ee86942 1943 */
9c44cd45 1944 str = scm_i_string_start_writing (str);
3ee86942
MV
1945 chars = scm_i_string_writable_chars (str);
1946 scm_i_string_stop_writing ();
1947 return chars;
1948}
1949
1950size_t
fe78c51a 1951scm_i_deprecated_string_length (SCM str)
3ee86942
MV
1952{
1953 scm_c_issue_deprecation_warning
1954 ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
1955 return scm_c_string_length (str);
1956}
1957
1958#endif
1959
2a610be5
AW
1960static SCM
1961string_handle_ref (scm_t_array_handle *h, size_t index)
1962{
1963 return scm_c_string_ref (h->array, index);
1964}
1965
1966static void
1967string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
1968{
1969 scm_c_string_set_x (h->array, index, val);
1970}
1971
1972static void
1973string_get_handle (SCM v, scm_t_array_handle *h)
1974{
1975 h->array = v;
1976 h->ndims = 1;
1977 h->dims = &h->dim0;
1978 h->dim0.lbnd = 0;
1979 h->dim0.ubnd = scm_c_string_length (v) - 1;
1980 h->dim0.inc = 1;
1981 h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
1982 h->elements = h->writable_elements = NULL;
1983}
1984
c5f17102 1985SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2a610be5 1986 string_handle_ref, string_handle_set,
f65e0168
LC
1987 string_get_handle)
1988SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2a610be5 1989
0f2d19dd
JB
1990void
1991scm_init_strings ()
0f2d19dd 1992{
3ee86942 1993 scm_nullstr = scm_i_make_string (0, NULL);
7c33806a 1994
a0599745 1995#include "libguile/strings.x"
0f2d19dd
JB
1996}
1997
89e00824
ML
1998
1999/*
2000 Local Variables:
2001 c-file-style: "gnu"
2002 End:
2003*/