Merge commit 'd360671c1cca335600079f1c5714572d1c2e676d'
[bpt/guile.git] / libguile / strings.c
CommitLineData
8c76a897 1/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
0f2d19dd 2 *
73be1d9e 3 * This library is free software; you can redistribute it and/or
53befeb7
NJ
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
0f2d19dd 7 *
53befeb7
NJ
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
0f2d19dd 12 *
73be1d9e
MV
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
53befeb7
NJ
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
73be1d9e 17 */
1bbd0b84 18
1bbd0b84 19
0f2d19dd 20\f
dbb605f5
LC
21#ifdef HAVE_CONFIG_H
22# include <config.h>
23#endif
0f2d19dd 24
cdd47ec7 25#include <alloca.h>
faf2c9d7 26#include <string.h>
3ee86942 27#include <stdio.h>
9c44cd45 28#include <ctype.h>
edb7bb47 29#include <uninorm.h>
9c44cd45 30#include <unistr.h>
eca29b02 31#include <uniconv.h>
f6f4feb0 32#include <c-strcase.h>
faf2c9d7 33
889975e5
MG
34#include "striconveh.h"
35
a0599745
MD
36#include "libguile/_scm.h"
37#include "libguile/chars.h"
7c33806a 38#include "libguile/root.h"
a0599745 39#include "libguile/strings.h"
f6f4feb0
MW
40#include "libguile/ports.h"
41#include "libguile/ports-internal.h"
a3d7d5d5 42#include "libguile/error.h"
f45eccff 43#include "libguile/generalized-vectors.h"
1afff620 44#include "libguile/deprecation.h"
a0599745 45#include "libguile/validate.h"
d31b9519 46#include "libguile/private-options.h"
1afff620 47
0f2d19dd
JB
48\f
49
50/* {Strings}
51 */
52
3ee86942
MV
53
54/* Stringbufs
55 *
56 * XXX - keeping an accurate refcount during GC seems to be quite
57 * tricky, so we just keep score of whether a stringbuf might be
50b1996f 58 * shared, not whether it definitely is.
3ee86942
MV
59 *
60 * The scheme I (mvo) tried to keep an accurate reference count would
61 * recount all strings that point to a stringbuf during the mark-phase
62 * of the GC. This was done since one cannot access the stringbuf of
63 * a string when that string is freed (in order to decrease the
64 * reference count). The memory of the stringbuf might have been
65 * reused already for something completely different.
66 *
67 * This recounted worked for a small number of threads beating on
68 * cow-strings, but it failed randomly with more than 10 threads, say.
69 * I couldn't figure out what went wrong, so I used the conservative
70 * approach implemented below.
50b1996f 71 *
ba54a202
LC
72 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
73 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
74 * strings.
3ee86942
MV
75 */
76
ba54a202
LC
77/* The size in words of the stringbuf header (type tag + size). */
78#define STRINGBUF_HEADER_SIZE 2U
79
80#define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
81
35920c00 82#define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
5f236208 83#define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
3ee86942
MV
84
85#define STRINGBUF_TAG scm_tc7_stringbuf
86#define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
9c44cd45 87#define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
3ee86942 88
100e20c7 89#define STRINGBUF_CONTENTS(buf) ((void *) \
ba54a202
LC
90 SCM_CELL_OBJECT_LOC (buf, \
91 STRINGBUF_HEADER_SIZE))
100e20c7
LC
92#define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
93#define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
3ee86942 94
100e20c7 95#define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
3ee86942 96
9b41542f
LC
97#define SET_STRINGBUF_SHARED(buf) \
98 do \
99 { \
100 /* Don't modify BUF if it's already marked as shared since it might be \
101 a read-only, statically allocated stringbuf. */ \
102 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
103 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
104 } \
105 while (0)
3ee86942 106
56a3dcd4 107#ifdef SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
108static size_t lenhist[1001];
109#endif
110
50b1996f
MG
111/* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
112 characters. */
3ee86942
MV
113static SCM
114make_stringbuf (size_t len)
0f2d19dd 115{
3ee86942
MV
116 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
117 scm_i_symbol_chars, all stringbufs are null-terminated. Once
118 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
119 has been changed for scm_i_symbol_chars, this null-termination
120 can be dropped.
121 */
122
ba54a202
LC
123 SCM buf;
124
56a3dcd4 125#ifdef SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
126 if (len < 1000)
127 lenhist[len]++;
128 else
129 lenhist[1000]++;
130#endif
0f2d19dd 131
21041372 132 buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
ba54a202
LC
133 "string"));
134
135 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
136 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
137
138 STRINGBUF_CHARS (buf)[len] = 0;
139
140 return buf;
3ee86942 141}
e53cc817 142
50b1996f
MG
143/* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
144 characters. */
9c44cd45
MG
145static SCM
146make_wide_stringbuf (size_t len)
147{
ba54a202
LC
148 SCM buf;
149 size_t raw_len;
150
56a3dcd4 151#ifdef SCM_STRING_LENGTH_HISTOGRAM
9c44cd45
MG
152 if (len < 1000)
153 lenhist[len]++;
154 else
155 lenhist[1000]++;
156#endif
157
ba54a202 158 raw_len = (len + 1) * sizeof (scm_t_wchar);
21041372 159 buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
ba54a202
LC
160 "string"));
161
162 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
163 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
164
165 STRINGBUF_WIDE_CHARS (buf)[len] = 0;
166
167 return buf;
9c44cd45
MG
168}
169
ba54a202
LC
170/* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
171 characters from BUF. */
172static SCM
173wide_stringbuf (SCM buf)
9c44cd45 174{
ba54a202 175 SCM new_buf;
9c44cd45
MG
176
177 if (STRINGBUF_WIDE (buf))
ba54a202
LC
178 new_buf = buf;
179 else
9c44cd45 180 {
ba54a202
LC
181 size_t i, len;
182 scm_t_wchar *mem;
9c44cd45 183
ba54a202 184 len = STRINGBUF_LENGTH (buf);
9c44cd45 185
ba54a202 186 new_buf = make_wide_stringbuf (len);
9c44cd45 187
ba54a202 188 mem = STRINGBUF_WIDE_CHARS (new_buf);
9c44cd45 189 for (i = 0; i < len; i++)
ba54a202 190 mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
9c44cd45 191 mem[len] = 0;
9c44cd45 192 }
ba54a202
LC
193
194 return new_buf;
3ee86942 195}
bd9e24b3 196
ba54a202
LC
197/* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
198 characters from BUF, if possible. */
199static SCM
587a3355
MG
200narrow_stringbuf (SCM buf)
201{
ba54a202 202 SCM new_buf;
587a3355
MG
203
204 if (!STRINGBUF_WIDE (buf))
ba54a202
LC
205 new_buf = buf;
206 else
207 {
208 size_t i, len;
209 scm_t_wchar *wmem;
210 unsigned char *mem;
587a3355 211
ba54a202
LC
212 len = STRINGBUF_LENGTH (buf);
213 wmem = STRINGBUF_WIDE_CHARS (buf);
587a3355 214
ba54a202
LC
215 for (i = 0; i < len; i++)
216 if (wmem[i] > 0xFF)
217 /* BUF cannot be narrowed. */
218 return buf;
587a3355 219
ba54a202 220 new_buf = make_stringbuf (len);
587a3355 221
ba54a202
LC
222 mem = STRINGBUF_CHARS (new_buf);
223 for (i = 0; i < len; i++)
224 mem[i] = (unsigned char) wmem[i];
225 mem[len] = 0;
226 }
227
228 return new_buf;
587a3355
MG
229}
230
9de87eea 231scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
bd9e24b3 232
ba54a202 233\f
3ee86942
MV
234/* Copy-on-write strings.
235 */
bd9e24b3 236
3ee86942 237#define STRING_TAG scm_tc7_string
bd9e24b3 238
3ee86942
MV
239#define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
240#define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
241#define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
bd9e24b3 242
3ee86942
MV
243#define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
244#define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
245
dc7da0be 246#define IS_STRING(str) (SCM_HAS_TYP7 (str, STRING_TAG))
3ee86942 247
ed35de72
MV
248/* Read-only strings.
249 */
250
35920c00 251#define RO_STRING_TAG scm_tc7_ro_string
ed35de72
MV
252#define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
253
e1b29f6a
MV
254/* Mutation-sharing substrings
255 */
256
257#define SH_STRING_TAG (scm_tc7_string + 0x100)
258
259#define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
260/* START and LENGTH as for STRINGs. */
261
262#define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
263
db071766
AW
264void
265scm_i_print_stringbuf (SCM exp, SCM port, scm_print_state *pstate)
266{
267 SCM str;
268
269 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
270 SET_STRINGBUF_SHARED (exp);
271 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
272
273 str = scm_double_cell (RO_STRING_TAG, SCM_UNPACK(exp),
274 0, STRINGBUF_LENGTH (exp));
275
276 scm_puts ("#<stringbuf ", port);
277 scm_iprin1 (str, port, pstate);
278 scm_puts (">", port);
279}
280
e7efe8e7
AW
281SCM scm_nullstr;
282
50b1996f
MG
283/* Create a scheme string with space for LEN 8-bit Latin-1-encoded
284 characters. CHARSP, if not NULL, will be set to location of the
190d4b0d
LC
285 char array. If READ_ONLY_P, the returned string is read-only;
286 otherwise it is writable. */
3ee86942 287SCM
190d4b0d 288scm_i_make_string (size_t len, char **charsp, int read_only_p)
3ee86942 289{
69cd5299
MW
290 static SCM null_stringbuf = SCM_BOOL_F;
291 SCM buf;
3ee86942 292 SCM res;
69cd5299
MW
293
294 if (len == 0)
295 {
296 if (SCM_UNLIKELY (scm_is_false (null_stringbuf)))
297 {
298 null_stringbuf = make_stringbuf (0);
299 SET_STRINGBUF_SHARED (null_stringbuf);
300 }
301 buf = null_stringbuf;
302 }
303 else
304 buf = make_stringbuf (len);
305
3ee86942 306 if (charsp)
f59cf998 307 *charsp = (char *) STRINGBUF_CHARS (buf);
190d4b0d
LC
308 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
309 SCM_UNPACK (buf),
310 (scm_t_bits) 0, (scm_t_bits) len);
3ee86942 311 return res;
0f2d19dd
JB
312}
313
50b1996f
MG
314/* Create a scheme string with space for LEN 32-bit UCS-4-encoded
315 characters. CHARSP, if not NULL, will be set to location of the
190d4b0d
LC
316 character array. If READ_ONLY_P, the returned string is read-only;
317 otherwise it is writable. */
9c44cd45 318SCM
190d4b0d 319scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
9c44cd45
MG
320{
321 SCM buf = make_wide_stringbuf (len);
322 SCM res;
323 if (charsp)
324 *charsp = STRINGBUF_WIDE_CHARS (buf);
190d4b0d
LC
325 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
326 SCM_UNPACK (buf),
9c44cd45
MG
327 (scm_t_bits) 0, (scm_t_bits) len);
328 return res;
329}
330
3ee86942
MV
331static void
332validate_substring_args (SCM str, size_t start, size_t end)
333{
334 if (!IS_STRING (str))
335 scm_wrong_type_arg_msg (NULL, 0, str, "string");
336 if (start > STRING_LENGTH (str))
337 scm_out_of_range (NULL, scm_from_size_t (start));
338 if (end > STRING_LENGTH (str) || end < start)
339 scm_out_of_range (NULL, scm_from_size_t (end));
340}
0f2d19dd 341
e1b29f6a
MV
342static inline void
343get_str_buf_start (SCM *str, SCM *buf, size_t *start)
344{
345 *start = STRING_START (*str);
346 if (IS_SH_STRING (*str))
347 {
348 *str = SH_STRING_STRING (*str);
349 *start += STRING_START (*str);
350 }
351 *buf = STRING_STRINGBUF (*str);
352}
353
3ee86942
MV
354SCM
355scm_i_substring (SCM str, size_t start, size_t end)
0f2d19dd 356{
17bec545
MW
357 if (start == end)
358 return scm_i_make_string (0, NULL, 0);
359 else
360 {
361 SCM buf;
362 size_t str_start;
363 get_str_buf_start (&str, &buf, &str_start);
364 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
365 SET_STRINGBUF_SHARED (buf);
366 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
367 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
368 (scm_t_bits)str_start + start,
369 (scm_t_bits) end - start);
370 }
0f2d19dd
JB
371}
372
ed35de72
MV
373SCM
374scm_i_substring_read_only (SCM str, size_t start, size_t end)
375{
17bec545
MW
376 if (start == end)
377 return scm_i_make_string (0, NULL, 1);
378 else
379 {
380 SCM buf;
381 size_t str_start;
382 get_str_buf_start (&str, &buf, &str_start);
383 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
384 SET_STRINGBUF_SHARED (buf);
385 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
386 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
387 (scm_t_bits)str_start + start,
388 (scm_t_bits) end - start);
389 }
ed35de72
MV
390}
391
3ee86942
MV
392SCM
393scm_i_substring_copy (SCM str, size_t start, size_t end)
394{
d5b75b6c
MW
395 if (start == end)
396 return scm_i_make_string (0, NULL, 0);
9c44cd45
MG
397 else
398 {
d5b75b6c
MW
399 size_t len = end - start;
400 SCM buf, my_buf, substr;
401 size_t str_start;
402 int wide = 0;
403 get_str_buf_start (&str, &buf, &str_start);
404 if (scm_i_is_narrow_string (str))
405 {
406 my_buf = make_stringbuf (len);
407 memcpy (STRINGBUF_CHARS (my_buf),
408 STRINGBUF_CHARS (buf) + str_start + start, len);
409 }
410 else
411 {
412 my_buf = make_wide_stringbuf (len);
413 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
414 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
415 + start), len);
416 wide = 1;
417 }
418 scm_remember_upto_here_1 (buf);
419 substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
420 (scm_t_bits) 0, (scm_t_bits) len);
421 if (wide)
422 scm_i_try_narrow_string (substr);
423 return substr;
9c44cd45 424 }
3ee86942 425}
0f2d19dd 426
e1b29f6a
MV
427SCM
428scm_i_substring_shared (SCM str, size_t start, size_t end)
429{
430 if (start == 0 && end == STRING_LENGTH (str))
431 return str;
17bec545
MW
432 else if (start == end)
433 return scm_i_make_string (0, NULL, 0);
434 else
e1b29f6a
MV
435 {
436 size_t len = end - start;
437 if (IS_SH_STRING (str))
438 {
439 start += STRING_START (str);
440 str = SH_STRING_STRING (str);
441 }
442 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
443 (scm_t_bits)start, (scm_t_bits) len);
444 }
445}
446
3ee86942
MV
447SCM
448scm_c_substring (SCM str, size_t start, size_t end)
449{
450 validate_substring_args (str, start, end);
451 return scm_i_substring (str, start, end);
452}
ee149d03 453
ed35de72
MV
454SCM
455scm_c_substring_read_only (SCM str, size_t start, size_t end)
456{
457 validate_substring_args (str, start, end);
458 return scm_i_substring_read_only (str, start, end);
459}
460
0f2d19dd 461SCM
3ee86942 462scm_c_substring_copy (SCM str, size_t start, size_t end)
0f2d19dd 463{
3ee86942
MV
464 validate_substring_args (str, start, end);
465 return scm_i_substring_copy (str, start, end);
466}
467
3ee86942
MV
468SCM
469scm_c_substring_shared (SCM str, size_t start, size_t end)
470{
471 validate_substring_args (str, start, end);
472 return scm_i_substring_shared (str, start, end);
473}
0f2d19dd 474
d6c74168 475\f
3ee86942
MV
476/* Internal accessors
477 */
478
50b1996f
MG
479/* Returns the number of characters in STR. This may be different
480 than the memory size of the string storage. */
3ee86942
MV
481size_t
482scm_i_string_length (SCM str)
0f2d19dd 483{
3ee86942 484 return STRING_LENGTH (str);
0f2d19dd
JB
485}
486
50b1996f
MG
487/* True if the string is 'narrow', meaning it has a 8-bit Latin-1
488 encoding. False if it is 'wide', having a 32-bit UCS-4
489 encoding. */
9c44cd45
MG
490int
491scm_i_is_narrow_string (SCM str)
492{
49d09292
MW
493 if (IS_SH_STRING (str))
494 str = SH_STRING_STRING (str);
495
9c44cd45
MG
496 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
497}
498
587a3355
MG
499/* Try to coerce a string to be narrow. It if is narrow already, do
500 nothing. If it is wide, shrink it to narrow if none of its
501 characters are above 0xFF. Return true if the string is narrow or
502 was made to be narrow. */
503int
504scm_i_try_narrow_string (SCM str)
505{
49d09292
MW
506 if (IS_SH_STRING (str))
507 str = SH_STRING_STRING (str);
508
ba54a202 509 SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
587a3355
MG
510
511 return scm_i_is_narrow_string (str);
512}
513
100e20c7
LC
514/* Return a pointer to the raw data of the string, which can be either Latin-1
515 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
516const void *
517scm_i_string_data (SCM str)
518{
519 SCM buf;
520 size_t start;
521 const char *data;
522
523 get_str_buf_start (&str, &buf, &start);
524
525 data = STRINGBUF_CONTENTS (buf);
526 data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
527
528 return data;
529}
530
50b1996f
MG
531/* Returns a pointer to the 8-bit Latin-1 encoded character array of
532 STR. */
3ee86942
MV
533const char *
534scm_i_string_chars (SCM str)
535{
536 SCM buf;
e1b29f6a
MV
537 size_t start;
538 get_str_buf_start (&str, &buf, &start);
9c44cd45 539 if (scm_i_is_narrow_string (str))
f59cf998 540 return (const char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
541 else
542 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
543 scm_list_1 (str));
544 return NULL;
3ee86942 545}
b00418df 546
50b1996f
MG
547/* Returns a pointer to the 32-bit UCS-4 encoded character array of
548 STR. */
9c44cd45
MG
549const scm_t_wchar *
550scm_i_string_wide_chars (SCM str)
551{
552 SCM buf;
553 size_t start;
554
555 get_str_buf_start (&str, &buf, &start);
556 if (!scm_i_is_narrow_string (str))
f59cf998 557 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
9c44cd45
MG
558 else
559 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
560 scm_list_1 (str));
561}
562
563/* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
564 a new string buffer, so that it can be modified without modifying
50b1996f
MG
565 other strings. Also, lock the string mutex. Later, one must call
566 scm_i_string_stop_writing to unlock the mutex. */
9c44cd45
MG
567SCM
568scm_i_string_start_writing (SCM orig_str)
b00418df 569{
ed35de72 570 SCM buf, str = orig_str;
e1b29f6a 571 size_t start;
ed35de72 572
e1b29f6a 573 get_str_buf_start (&str, &buf, &start);
ed35de72
MV
574 if (IS_RO_STRING (str))
575 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
576
9de87eea 577 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942
MV
578 if (STRINGBUF_SHARED (buf))
579 {
9c44cd45 580 /* Clone the stringbuf. */
3ee86942
MV
581 size_t len = STRING_LENGTH (str);
582 SCM new_buf;
583
9de87eea 584 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 585
9c44cd45
MG
586 if (scm_i_is_narrow_string (str))
587 {
588 new_buf = make_stringbuf (len);
589 memcpy (STRINGBUF_CHARS (new_buf),
590 STRINGBUF_CHARS (buf) + STRING_START (str), len);
591
592 }
593 else
594 {
595 new_buf = make_wide_stringbuf (len);
596 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
597 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
598 + STRING_START (str)), len);
599 }
3ee86942 600
3ee86942
MV
601 SET_STRING_STRINGBUF (str, new_buf);
602 start -= STRING_START (str);
902578f1
LC
603
604 /* FIXME: The following operations are not atomic, so other threads
605 looking at STR may see an inconsistent state. Nevertheless it can't
606 hurt much since (i) accessing STR while it is being mutated can't
607 yield a crash, and (ii) concurrent accesses to STR should be
608 protected by a mutex at the application level. The latter may not
609 apply when STR != ORIG_STR, though. */
3ee86942 610 SET_STRING_START (str, 0);
902578f1 611 SET_STRING_STRINGBUF (str, new_buf);
3ee86942
MV
612
613 buf = new_buf;
614
9de87eea 615 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 616 }
9c44cd45
MG
617 return orig_str;
618}
3ee86942 619
50b1996f 620/* Return a pointer to the 8-bit Latin-1 chars of a string. */
9c44cd45
MG
621char *
622scm_i_string_writable_chars (SCM str)
623{
624 SCM buf;
625 size_t start;
3ee86942 626
9c44cd45
MG
627 get_str_buf_start (&str, &buf, &start);
628 if (scm_i_is_narrow_string (str))
f59cf998 629 return (char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
630 else
631 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
632 scm_list_1 (str));
633 return NULL;
b00418df
DH
634}
635
50b1996f 636/* Return a pointer to the UCS-4 codepoints of a string. */
9c44cd45
MG
637static scm_t_wchar *
638scm_i_string_writable_wide_chars (SCM str)
639{
640 SCM buf;
641 size_t start;
642
643 get_str_buf_start (&str, &buf, &start);
644 if (!scm_i_is_narrow_string (str))
645 return STRINGBUF_WIDE_CHARS (buf) + start;
646 else
1c7b216f 647 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
9c44cd45 648 scm_list_1 (str));
b00418df
DH
649}
650
50b1996f
MG
651/* Unlock the string mutex that was locked when
652 scm_i_string_start_writing was called. */
3ee86942
MV
653void
654scm_i_string_stop_writing (void)
655{
9de87eea 656 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 657}
b00418df 658
50b1996f 659/* Return the Xth character of STR as a UCS-4 codepoint. */
9c44cd45
MG
660scm_t_wchar
661scm_i_string_ref (SCM str, size_t x)
662{
663 if (scm_i_is_narrow_string (str))
664 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
665 else
666 return scm_i_string_wide_chars (str)[x];
667}
668
889975e5
MG
669/* Returns index+1 of the first char in STR that matches C, or
670 0 if the char is not found. */
671int
672scm_i_string_contains_char (SCM str, char ch)
673{
674 size_t i;
675 size_t len = scm_i_string_length (str);
676
677 i = 0;
678 if (scm_i_is_narrow_string (str))
679 {
680 while (i < len)
681 {
682 if (scm_i_string_chars (str)[i] == ch)
683 return i+1;
684 i++;
685 }
686 }
687 else
688 {
689 while (i < len)
690 {
691 if (scm_i_string_wide_chars (str)[i]
692 == (unsigned char) ch)
693 return i+1;
694 i++;
695 }
696 }
697 return 0;
698}
699
3f47e526
MG
700int
701scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
702{
703 if (scm_i_is_narrow_string (sstr))
704 {
705 const char *a = scm_i_string_chars (sstr) + start_x;
706 const char *b = cstr;
707 return strncmp (a, b, strlen(b));
708 }
709 else
710 {
711 size_t i;
712 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
713 const char *b = cstr;
714 for (i = 0; i < strlen (b); i++)
715 {
716 if (a[i] != (unsigned char) b[i])
717 return 1;
718 }
719 }
720 return 0;
721}
722
50b1996f 723/* Set the Pth character of STR to UCS-4 codepoint CHR. */
9c44cd45
MG
724void
725scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
726{
49d09292
MW
727 if (IS_SH_STRING (str))
728 {
729 p += STRING_START (str);
730 str = SH_STRING_STRING (str);
731 }
732
9c44cd45 733 if (chr > 0xFF && scm_i_is_narrow_string (str))
ba54a202 734 SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
9c44cd45
MG
735
736 if (scm_i_is_narrow_string (str))
737 {
738 char *dst = scm_i_string_writable_chars (str);
587a3355 739 dst[p] = chr;
9c44cd45
MG
740 }
741 else
742 {
743 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
744 dst[p] = chr;
745 }
746}
747
ba54a202 748\f
3ee86942 749/* Symbols.
587a3355 750
3ee86942
MV
751 Basic symbol creation and accessing is done here, the rest is in
752 symbols.[hc]. This has been done to keep stringbufs and the
753 internals of strings and string-like objects confined to this file.
754*/
755
756#define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
757
758SCM
6869328b
MV
759scm_i_make_symbol (SCM name, scm_t_bits flags,
760 unsigned long hash, SCM props)
3ee86942
MV
761{
762 SCM buf;
763 size_t start = STRING_START (name);
764 size_t length = STRING_LENGTH (name);
765
766 if (IS_SH_STRING (name))
767 {
768 name = SH_STRING_STRING (name);
769 start += STRING_START (name);
770 }
1a4d7653 771 buf = STRING_STRINGBUF (name);
3ee86942
MV
772
773 if (start == 0 && length == STRINGBUF_LENGTH (buf))
774 {
775 /* reuse buf. */
9de87eea 776 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 777 SET_STRINGBUF_SHARED (buf);
9de87eea 778 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942
MV
779 }
780 else
781 {
782 /* make new buf. */
9c44cd45
MG
783 if (scm_i_is_narrow_string (name))
784 {
785 SCM new_buf = make_stringbuf (length);
786 memcpy (STRINGBUF_CHARS (new_buf),
787 STRINGBUF_CHARS (buf) + start, length);
788 buf = new_buf;
789 }
790 else
791 {
792 SCM new_buf = make_wide_stringbuf (length);
793 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
794 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
795 length);
796 buf = new_buf;
797 }
3ee86942 798 }
6869328b 799 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
3ee86942
MV
800 (scm_t_bits) hash, SCM_UNPACK (props));
801}
802
fd0a5bbc
HWN
803SCM
804scm_i_c_make_symbol (const char *name, size_t len,
805 scm_t_bits flags, unsigned long hash, SCM props)
806{
807 SCM buf = make_stringbuf (len);
808 memcpy (STRINGBUF_CHARS (buf), name, len);
809
65619ebe
AW
810 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
811 (scm_t_bits) hash, SCM_UNPACK (props));
fd0a5bbc
HWN
812}
813
50b1996f
MG
814/* Returns the number of characters in SYM. This may be different
815 from the memory size of SYM. */
3ee86942
MV
816size_t
817scm_i_symbol_length (SCM sym)
0f2d19dd 818{
3ee86942 819 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
0f2d19dd
JB
820}
821
071bb6a8
LC
822size_t
823scm_c_symbol_length (SCM sym)
824#define FUNC_NAME "scm_c_symbol_length"
825{
826 SCM_VALIDATE_SYMBOL (1, sym);
827
828 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
829}
830#undef FUNC_NAME
831
50b1996f
MG
832/* True if the name of SYM is stored as a Latin-1 encoded string.
833 False if it is stored as a 32-bit UCS-4-encoded string. */
9c44cd45
MG
834int
835scm_i_is_narrow_symbol (SCM sym)
836{
837 SCM buf;
838
839 buf = SYMBOL_STRINGBUF (sym);
840 return !STRINGBUF_WIDE (buf);
841}
842
50b1996f
MG
843/* Returns a pointer to the 8-bit Latin-1 encoded character array that
844 contains the name of SYM. */
3ee86942
MV
845const char *
846scm_i_symbol_chars (SCM sym)
847{
9c44cd45
MG
848 SCM buf;
849
850 buf = SYMBOL_STRINGBUF (sym);
851 if (!STRINGBUF_WIDE (buf))
f59cf998 852 return (const char *) STRINGBUF_CHARS (buf);
9c44cd45
MG
853 else
854 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
855 scm_list_1 (sym));
856}
857
50b1996f
MG
858/* Return a pointer to the 32-bit UCS-4-encoded character array of a
859 symbol's name. */
9c44cd45
MG
860const scm_t_wchar *
861scm_i_symbol_wide_chars (SCM sym)
862{
863 SCM buf;
864
865 buf = SYMBOL_STRINGBUF (sym);
866 if (STRINGBUF_WIDE (buf))
f59cf998 867 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
9c44cd45
MG
868 else
869 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
870 scm_list_1 (sym));
3ee86942 871}
1cc91f1b 872
be54b15d 873SCM
3ee86942 874scm_i_symbol_substring (SCM sym, size_t start, size_t end)
be54b15d 875{
3ee86942 876 SCM buf = SYMBOL_STRINGBUF (sym);
9de87eea 877 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 878 SET_STRINGBUF_SHARED (buf);
9de87eea 879 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
fd2b17b9 880 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
3ee86942
MV
881 (scm_t_bits)start, (scm_t_bits) end - start);
882}
be54b15d 883
50b1996f 884/* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
9c44cd45
MG
885scm_t_wchar
886scm_i_symbol_ref (SCM sym, size_t x)
887{
888 if (scm_i_is_narrow_symbol (sym))
889 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
890 else
891 return scm_i_symbol_wide_chars (sym)[x];
892}
893
3ee86942
MV
894/* Debugging
895 */
be54b15d 896
6ce6923b
MG
897SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
898 "Returns an association list containing debugging information\n"
899 "for @var{str}. The association list has the following entries."
900 "@table @code\n"
901 "@item string\n"
902 "The string itself.\n"
903 "@item start\n"
904 "The start index of the string into its stringbuf\n"
905 "@item length\n"
906 "The length of the string\n"
907 "@item shared\n"
908 "If this string is a substring, it returns its parent string.\n"
909 "Otherwise, it returns @code{#f}\n"
88ed5759
MG
910 "@item read-only\n"
911 "@code{#t} if the string is read-only\n"
6ce6923b
MG
912 "@item stringbuf-chars\n"
913 "A new string containing this string's stringbuf's characters\n"
914 "@item stringbuf-length\n"
915 "The number of characters in this stringbuf\n"
916 "@item stringbuf-shared\n"
917 "@code{#t} if this stringbuf is shared\n"
6ce6923b
MG
918 "@item stringbuf-wide\n"
919 "@code{#t} if this stringbuf's characters are stored in a\n"
920 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
921 "buffer\n"
922 "@end table")
3ee86942
MV
923#define FUNC_NAME s_scm_sys_string_dump
924{
ba54a202 925 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
6ce6923b 926 SCM buf;
3ee86942 927 SCM_VALIDATE_STRING (1, str);
6ce6923b
MG
928
929 /* String info */
4a655e50 930 e1 = scm_cons (scm_from_latin1_symbol ("string"),
6ce6923b 931 str);
4a655e50 932 e2 = scm_cons (scm_from_latin1_symbol ("start"),
6ce6923b 933 scm_from_size_t (STRING_START (str)));
4a655e50 934 e3 = scm_cons (scm_from_latin1_symbol ("length"),
6ce6923b
MG
935 scm_from_size_t (STRING_LENGTH (str)));
936
3ee86942
MV
937 if (IS_SH_STRING (str))
938 {
4a655e50 939 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
6ce6923b
MG
940 SH_STRING_STRING (str));
941 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
3ee86942
MV
942 }
943 else
944 {
4a655e50 945 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
6ce6923b
MG
946 SCM_BOOL_F);
947 buf = STRING_STRINGBUF (str);
3ee86942 948 }
9c44cd45 949
88ed5759 950 if (IS_RO_STRING (str))
4a655e50 951 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
88ed5759
MG
952 SCM_BOOL_T);
953 else
4a655e50 954 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
88ed5759 955 SCM_BOOL_F);
587a3355 956
6ce6923b 957 /* Stringbuf info */
6ce6923b
MG
958 if (!STRINGBUF_WIDE (buf))
959 {
960 size_t len = STRINGBUF_LENGTH (buf);
961 char *cbuf;
190d4b0d 962 SCM sbc = scm_i_make_string (len, &cbuf, 0);
6ce6923b 963 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
4a655e50 964 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b 965 sbc);
3ee86942 966 }
6ce6923b
MG
967 else
968 {
969 size_t len = STRINGBUF_LENGTH (buf);
970 scm_t_wchar *cbuf;
190d4b0d 971 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
6ce6923b
MG
972 u32_cpy ((scm_t_uint32 *) cbuf,
973 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
4a655e50 974 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b
MG
975 sbc);
976 }
4a655e50 977 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
6ce6923b
MG
978 scm_from_size_t (STRINGBUF_LENGTH (buf)));
979 if (STRINGBUF_SHARED (buf))
4a655e50 980 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b
MG
981 SCM_BOOL_T);
982 else
4a655e50 983 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b 984 SCM_BOOL_F);
6ce6923b 985 if (STRINGBUF_WIDE (buf))
4a655e50 986 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
ba54a202 987 SCM_BOOL_T);
6ce6923b 988 else
4a655e50 989 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
ba54a202 990 SCM_BOOL_F);
6ce6923b 991
ba54a202 992 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
3ee86942
MV
993}
994#undef FUNC_NAME
995
6ce6923b
MG
996SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
997 "Returns an association list containing debugging information\n"
998 "for @var{sym}. The association list has the following entries."
999 "@table @code\n"
1000 "@item symbol\n"
1001 "The symbol itself\n"
1002 "@item hash\n"
1003 "Its hash value\n"
88ed5759
MG
1004 "@item interned\n"
1005 "@code{#t} if it is an interned symbol\n"
6ce6923b
MG
1006 "@item stringbuf-chars\n"
1007 "A new string containing this symbols's stringbuf's characters\n"
1008 "@item stringbuf-length\n"
1009 "The number of characters in this stringbuf\n"
1010 "@item stringbuf-shared\n"
1011 "@code{#t} if this stringbuf is shared\n"
6ce6923b
MG
1012 "@item stringbuf-wide\n"
1013 "@code{#t} if this stringbuf's characters are stored in a\n"
1014 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
1015 "buffer\n"
1016 "@end table")
3ee86942
MV
1017#define FUNC_NAME s_scm_sys_symbol_dump
1018{
ba54a202 1019 SCM e1, e2, e3, e4, e5, e6, e7;
6ce6923b 1020 SCM buf;
3ee86942 1021 SCM_VALIDATE_SYMBOL (1, sym);
4a655e50 1022 e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
6ce6923b 1023 sym);
4a655e50 1024 e2 = scm_cons (scm_from_latin1_symbol ("hash"),
6ce6923b 1025 scm_from_ulong (scm_i_symbol_hash (sym)));
4a655e50 1026 e3 = scm_cons (scm_from_latin1_symbol ("interned"),
88ed5759 1027 scm_symbol_interned_p (sym));
6ce6923b
MG
1028 buf = SYMBOL_STRINGBUF (sym);
1029
1030 /* Stringbuf info */
6ce6923b
MG
1031 if (!STRINGBUF_WIDE (buf))
1032 {
1033 size_t len = STRINGBUF_LENGTH (buf);
1034 char *cbuf;
190d4b0d 1035 SCM sbc = scm_i_make_string (len, &cbuf, 0);
6ce6923b 1036 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
4a655e50 1037 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b
MG
1038 sbc);
1039 }
9c44cd45 1040 else
6ce6923b
MG
1041 {
1042 size_t len = STRINGBUF_LENGTH (buf);
1043 scm_t_wchar *cbuf;
190d4b0d 1044 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
6ce6923b
MG
1045 u32_cpy ((scm_t_uint32 *) cbuf,
1046 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
4a655e50 1047 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b
MG
1048 sbc);
1049 }
4a655e50 1050 e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
6ce6923b
MG
1051 scm_from_size_t (STRINGBUF_LENGTH (buf)));
1052 if (STRINGBUF_SHARED (buf))
4a655e50 1053 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b
MG
1054 SCM_BOOL_T);
1055 else
4a655e50 1056 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b 1057 SCM_BOOL_F);
6ce6923b 1058 if (STRINGBUF_WIDE (buf))
4a655e50 1059 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
6ce6923b
MG
1060 SCM_BOOL_T);
1061 else
4a655e50 1062 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
6ce6923b 1063 SCM_BOOL_F);
ba54a202 1064 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
6ce6923b 1065
3ee86942
MV
1066}
1067#undef FUNC_NAME
1068
56a3dcd4 1069#ifdef SCM_STRING_LENGTH_HISTOGRAM
6ce6923b 1070
9c44cd45 1071SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
e1b29f6a 1072#define FUNC_NAME s_scm_sys_stringbuf_hist
3ee86942
MV
1073{
1074 int i;
1075 for (i = 0; i < 1000; i++)
1076 if (lenhist[i])
1077 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1078 fprintf (stderr, ">999: %u\n", lenhist[1000]);
1079 return SCM_UNSPECIFIED;
be54b15d
DH
1080}
1081#undef FUNC_NAME
1082
3ee86942
MV
1083#endif
1084
1085\f
1086
1087SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1088 (SCM obj),
1089 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1090#define FUNC_NAME s_scm_string_p
1091{
1092 return scm_from_bool (IS_STRING (obj));
1093}
1094#undef FUNC_NAME
1095
1096
1097SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1098
1099SCM_DEFINE (scm_string, "string", 0, 0, 1,
1100 (SCM chrs),
1101 "@deffnx {Scheme Procedure} list->string chrs\n"
1102 "Return a newly allocated string composed of the arguments,\n"
1103 "@var{chrs}.")
1104#define FUNC_NAME s_scm_string
1105{
9aa27c1a 1106 SCM result = SCM_BOOL_F;
9c44cd45 1107 SCM rest;
3ee86942 1108 size_t len;
9c44cd45
MG
1109 size_t p = 0;
1110 long i;
9aa27c1a 1111 int wide = 0;
3ee86942 1112
9c44cd45
MG
1113 /* Verify that this is a list of chars. */
1114 i = scm_ilength (chrs);
3c7cf7f5 1115 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
3ee86942 1116
9c44cd45
MG
1117 len = (size_t) i;
1118 rest = chrs;
3ee86942 1119
9c44cd45 1120 while (len > 0 && scm_is_pair (rest))
3ee86942 1121 {
9c44cd45 1122 SCM elt = SCM_CAR (rest);
3ee86942 1123 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
9aa27c1a
MG
1124 if (SCM_CHAR (elt) > 0xFF)
1125 wide = 1;
9c44cd45
MG
1126 rest = SCM_CDR (rest);
1127 len--;
1128 scm_remember_upto_here_1 (elt);
1129 }
1130
1131 /* Construct a string containing this list of chars. */
1132 len = (size_t) i;
1133 rest = chrs;
1134
9aa27c1a 1135 if (wide == 0)
9c44cd45 1136 {
56a3dcd4
LC
1137 char *buf;
1138
190d4b0d 1139 result = scm_i_make_string (len, NULL, 0);
9aa27c1a 1140 result = scm_i_string_start_writing (result);
56a3dcd4 1141 buf = scm_i_string_writable_chars (result);
9aa27c1a
MG
1142 while (len > 0 && scm_is_pair (rest))
1143 {
1144 SCM elt = SCM_CAR (rest);
1145 buf[p] = (unsigned char) SCM_CHAR (elt);
1146 p++;
1147 rest = SCM_CDR (rest);
1148 len--;
1149 scm_remember_upto_here_1 (elt);
1150 }
1151 }
1152 else
1153 {
56a3dcd4
LC
1154 scm_t_wchar *buf;
1155
190d4b0d 1156 result = scm_i_make_wide_string (len, NULL, 0);
9aa27c1a 1157 result = scm_i_string_start_writing (result);
56a3dcd4 1158 buf = scm_i_string_writable_wide_chars (result);
9aa27c1a
MG
1159 while (len > 0 && scm_is_pair (rest))
1160 {
1161 SCM elt = SCM_CAR (rest);
1162 buf[p] = SCM_CHAR (elt);
1163 p++;
1164 rest = SCM_CDR (rest);
1165 len--;
1166 scm_remember_upto_here_1 (elt);
1167 }
3ee86942 1168 }
9c44cd45
MG
1169 scm_i_string_stop_writing ();
1170
3ee86942
MV
1171 if (len > 0)
1172 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
9c44cd45 1173 if (!scm_is_null (rest))
3ee86942
MV
1174 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1175
1176 return result;
1177}
1178#undef FUNC_NAME
be54b15d 1179
3b3b36dd 1180SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
6fa73e72 1181 (SCM k, SCM chr),
0d26a824
MG
1182 "Return a newly allocated string of\n"
1183 "length @var{k}. If @var{chr} is given, then all elements of\n"
1184 "the string are initialized to @var{chr}, otherwise the contents\n"
b7e64f8b 1185 "of the string are all set to @code{#\nul}.")
1bbd0b84 1186#define FUNC_NAME s_scm_make_string
0f2d19dd 1187{
3ee86942
MV
1188 return scm_c_make_string (scm_to_size_t (k), chr);
1189}
1190#undef FUNC_NAME
1191
1192SCM
1193scm_c_make_string (size_t len, SCM chr)
1194#define FUNC_NAME NULL
1195{
9c44cd45 1196 size_t p;
3ef6650d 1197 char *contents = NULL;
190d4b0d 1198 SCM res = scm_i_make_string (len, &contents, 0);
cb0d8be2 1199
3ef6650d
AW
1200 /* If no char is given, initialize string contents to NULL. */
1201 if (SCM_UNBNDP (chr))
1202 memset (contents, 0, len);
1203 else
e11e83f3 1204 {
3ee86942 1205 SCM_VALIDATE_CHAR (0, chr);
9c44cd45
MG
1206 res = scm_i_string_start_writing (res);
1207 for (p = 0; p < len; p++)
1208 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1209 scm_i_string_stop_writing ();
0f2d19dd 1210 }
e11e83f3
MV
1211
1212 return res;
0f2d19dd 1213}
1bbd0b84 1214#undef FUNC_NAME
0f2d19dd 1215
3b3b36dd 1216SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
0d26a824
MG
1217 (SCM string),
1218 "Return the number of characters in @var{string}.")
1bbd0b84 1219#define FUNC_NAME s_scm_string_length
0f2d19dd 1220{
d1ca2c64 1221 SCM_VALIDATE_STRING (1, string);
3ee86942 1222 return scm_from_size_t (STRING_LENGTH (string));
0f2d19dd 1223}
1bbd0b84 1224#undef FUNC_NAME
0f2d19dd 1225
f8ba2bb9 1226SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
9c44cd45
MG
1227 (SCM string),
1228 "Return the bytes used to represent a character in @var{string}."
1229 "This will return 1 or 4.")
f8ba2bb9 1230#define FUNC_NAME s_scm_string_bytes_per_char
9c44cd45
MG
1231{
1232 SCM_VALIDATE_STRING (1, string);
1233 if (!scm_i_is_narrow_string (string))
1234 return scm_from_int (4);
1235
1236 return scm_from_int (1);
1237}
1238#undef FUNC_NAME
1239
3ee86942
MV
1240size_t
1241scm_c_string_length (SCM string)
1242{
1243 if (!IS_STRING (string))
1244 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1245 return STRING_LENGTH (string);
1246}
1247
bd9e24b3 1248SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
6fa73e72 1249 (SCM str, SCM k),
9c44cd45
MG
1250 "Return character @var{k} of @var{str} using zero-origin\n"
1251 "indexing. @var{k} must be a valid index of @var{str}.")
1bbd0b84 1252#define FUNC_NAME s_scm_string_ref
0f2d19dd 1253{
3ae3166b 1254 size_t len;
a55c2b68 1255 unsigned long idx;
bd9e24b3 1256
d1ca2c64 1257 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1258
1259 len = scm_i_string_length (str);
1260 if (SCM_LIKELY (len > 0))
1261 idx = scm_to_unsigned_integer (k, 0, len - 1);
1262 else
1263 scm_out_of_range (NULL, k);
1264
9c44cd45
MG
1265 if (scm_i_is_narrow_string (str))
1266 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1267 else
1268 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
0f2d19dd 1269}
1bbd0b84 1270#undef FUNC_NAME
0f2d19dd 1271
3ee86942
MV
1272SCM
1273scm_c_string_ref (SCM str, size_t p)
1274{
1275 if (p >= scm_i_string_length (str))
1276 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1277 if (scm_i_is_narrow_string (str))
1278 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1279 else
1280 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1281
3ee86942 1282}
f0942910 1283
3b3b36dd 1284SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
6fa73e72 1285 (SCM str, SCM k, SCM chr),
9c44cd45
MG
1286 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1287 "an unspecified value. @var{k} must be a valid index of\n"
1288 "@var{str}.")
1bbd0b84 1289#define FUNC_NAME s_scm_string_set_x
0f2d19dd 1290{
3ae3166b 1291 size_t len;
a55c2b68
MV
1292 unsigned long idx;
1293
f0942910 1294 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1295
1296 len = scm_i_string_length (str);
1297 if (SCM_LIKELY (len > 0))
1298 idx = scm_to_unsigned_integer (k, 0, len - 1);
1299 else
1300 scm_out_of_range (NULL, k);
1301
34d19ef6 1302 SCM_VALIDATE_CHAR (3, chr);
9c44cd45
MG
1303 str = scm_i_string_start_writing (str);
1304 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1305 scm_i_string_stop_writing ();
1306
0f2d19dd
JB
1307 return SCM_UNSPECIFIED;
1308}
1bbd0b84 1309#undef FUNC_NAME
0f2d19dd 1310
3ee86942
MV
1311void
1312scm_c_string_set_x (SCM str, size_t p, SCM chr)
1313{
1314 if (p >= scm_i_string_length (str))
1315 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1316 str = scm_i_string_start_writing (str);
1317 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1318 scm_i_string_stop_writing ();
3ee86942 1319}
0f2d19dd 1320
3b3b36dd 1321SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
0d26a824
MG
1322 (SCM str, SCM start, SCM end),
1323 "Return a newly allocated string formed from the characters\n"
1324 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1325 "ending with index @var{end} (exclusive).\n"
1326 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1327 "exact integers satisfying:\n\n"
1328 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1bbd0b84 1329#define FUNC_NAME s_scm_substring
0f2d19dd 1330{
3ee86942 1331 size_t len, from, to;
685c0d71 1332
d1ca2c64 1333 SCM_VALIDATE_STRING (1, str);
3ee86942
MV
1334 len = scm_i_string_length (str);
1335 from = scm_to_unsigned_integer (start, 0, len);
a55c2b68 1336 if (SCM_UNBNDP (end))
3ee86942 1337 to = len;
a55c2b68 1338 else
3ee86942
MV
1339 to = scm_to_unsigned_integer (end, from, len);
1340 return scm_i_substring (str, from, to);
0f2d19dd 1341}
1bbd0b84 1342#undef FUNC_NAME
0f2d19dd 1343
ed35de72
MV
1344SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1345 (SCM str, SCM start, SCM end),
1346 "Return a newly allocated string formed from the characters\n"
1347 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1348 "ending with index @var{end} (exclusive).\n"
1349 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1350 "exact integers satisfying:\n"
1351 "\n"
1352 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1353 "\n"
1354 "The returned string is read-only.\n")
1355#define FUNC_NAME s_scm_substring_read_only
1356{
1357 size_t len, from, to;
1358
1359 SCM_VALIDATE_STRING (1, str);
1360 len = scm_i_string_length (str);
1361 from = scm_to_unsigned_integer (start, 0, len);
1362 if (SCM_UNBNDP (end))
1363 to = len;
1364 else
1365 to = scm_to_unsigned_integer (end, from, len);
1366 return scm_i_substring_read_only (str, from, to);
1367}
1368#undef FUNC_NAME
1369
3ee86942
MV
1370SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1371 (SCM str, SCM start, SCM end),
1372 "Return a newly allocated string formed from the characters\n"
1373 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1374 "ending with index @var{end} (exclusive).\n"
1375 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1376 "exact integers satisfying:\n\n"
1377 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1378#define FUNC_NAME s_scm_substring_copy
1379{
e1b29f6a
MV
1380 /* For the Scheme version, START is mandatory, but for the C
1381 version, it is optional. See scm_string_copy in srfi-13.c for a
1382 rationale.
1383 */
1384
1385 size_t from, to;
3ee86942
MV
1386
1387 SCM_VALIDATE_STRING (1, str);
e1b29f6a
MV
1388 scm_i_get_substring_spec (scm_i_string_length (str),
1389 start, &from, end, &to);
3ee86942
MV
1390 return scm_i_substring_copy (str, from, to);
1391}
1392#undef FUNC_NAME
1393
1394SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1395 (SCM str, SCM start, SCM end),
1396 "Return string that indirectly refers to the characters\n"
1397 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1398 "ending with index @var{end} (exclusive).\n"
1399 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1400 "exact integers satisfying:\n\n"
1401 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1402#define FUNC_NAME s_scm_substring_shared
1403{
1404 size_t len, from, to;
1405
1406 SCM_VALIDATE_STRING (1, str);
1407 len = scm_i_string_length (str);
1408 from = scm_to_unsigned_integer (start, 0, len);
1409 if (SCM_UNBNDP (end))
1410 to = len;
1411 else
1412 to = scm_to_unsigned_integer (end, from, len);
1413 return scm_i_substring_shared (str, from, to);
1414}
1415#undef FUNC_NAME
685c0d71 1416
3b3b36dd 1417SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
6fa73e72 1418 (SCM args),
9c44cd45 1419 "Return a newly allocated string whose characters form the\n"
0d26a824 1420 "concatenation of the given strings, @var{args}.")
1bbd0b84 1421#define FUNC_NAME s_scm_string_append
0f2d19dd
JB
1422{
1423 SCM res;
2f13a466
MW
1424 size_t total = 0;
1425 size_t len;
9c44cd45 1426 int wide = 0;
c829a427 1427 SCM l, s;
bd4911ef 1428 size_t i;
9909c395
MG
1429 union
1430 {
1431 char *narrow;
1432 scm_t_wchar *wide;
1433 } data;
af45e3b0
DH
1434
1435 SCM_VALIDATE_REST_ARGUMENT (args);
9c44cd45 1436 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427
MV
1437 {
1438 s = SCM_CAR (l);
1439 SCM_VALIDATE_STRING (SCM_ARGn, s);
2f13a466
MW
1440 len = scm_i_string_length (s);
1441 if (((size_t) -1) - total < len)
1442 scm_num_overflow (s_scm_string_append);
1443 total += len;
9c44cd45
MG
1444 if (!scm_i_is_narrow_string (s))
1445 wide = 1;
c829a427 1446 }
9909c395 1447 data.narrow = NULL;
9c44cd45 1448 if (!wide)
2f13a466 1449 res = scm_i_make_string (total, &data.narrow, 0);
9c44cd45 1450 else
2f13a466 1451 res = scm_i_make_wide_string (total, &data.wide, 0);
9c44cd45
MG
1452
1453 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427 1454 {
edea856c 1455 size_t len;
c829a427 1456 s = SCM_CAR (l);
3ee86942 1457 SCM_VALIDATE_STRING (SCM_ARGn, s);
edea856c 1458 len = scm_i_string_length (s);
2f13a466
MW
1459 if (len > total)
1460 SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
9c44cd45
MG
1461 if (!wide)
1462 {
9909c395
MG
1463 memcpy (data.narrow, scm_i_string_chars (s), len);
1464 data.narrow += len;
9c44cd45
MG
1465 }
1466 else
1467 {
1468 if (scm_i_is_narrow_string (s))
1469 {
2f13a466
MW
1470 const char *src = scm_i_string_chars (s);
1471 for (i = 0; i < len; i++)
1472 data.wide[i] = (unsigned char) src[i];
9c44cd45
MG
1473 }
1474 else
9909c395 1475 u32_cpy ((scm_t_uint32 *) data.wide,
9c44cd45 1476 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
9909c395 1477 data.wide += len;
9c44cd45 1478 }
2f13a466 1479 total -= len;
c829a427
MV
1480 scm_remember_upto_here_1 (s);
1481 }
2f13a466
MW
1482 if (total != 0)
1483 SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
0f2d19dd
JB
1484 return res;
1485}
1bbd0b84 1486#undef FUNC_NAME
0f2d19dd 1487
24933780 1488
a3d7d5d5 1489\f
c62da8f8 1490/* Charset conversion error handling. */
a3d7d5d5
LC
1491
1492SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
c62da8f8
LC
1493SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1494
6851d3be
LC
1495/* Raise an exception informing that character CHR could not be written
1496 to PORT in its current encoding. */
d14418a5 1497void
ef7e4ba3 1498scm_encoding_error (const char *subr, int err, const char *message,
6851d3be 1499 SCM port, SCM chr)
ef7e4ba3 1500{
ef7e4ba3 1501 scm_throw (scm_encoding_error_key,
d050ef66
AW
1502 scm_list_n (scm_from_latin1_string (subr),
1503 scm_from_latin1_string (message),
ef7e4ba3 1504 scm_from_int (err),
6851d3be 1505 port, chr,
ef7e4ba3 1506 SCM_UNDEFINED));
a3d7d5d5
LC
1507}
1508
c62da8f8
LC
1509/* Raise an exception informing of an encoding error on PORT. This
1510 means that a character could not be written in PORT's encoding. */
1511void
1512scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1513{
1514 scm_throw (scm_decoding_error_key,
d050ef66
AW
1515 scm_list_n (scm_from_latin1_string (subr),
1516 scm_from_latin1_string (message),
c62da8f8
LC
1517 scm_from_int (err),
1518 port,
1519 SCM_UNDEFINED));
1520}
1521
1522\f
1523/* String conversion to/from C. */
1524
41d1d984
AW
1525static void
1526decoding_error (const char *func_name, int errno_save,
1527 const char *str, size_t len)
1528{
1529 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1530 handler. */
1531 SCM bv;
1532 signed char *buf;
1533
1534 buf = scm_gc_malloc_pointerless (len, "bytevector");
1535 memcpy (buf, str, len);
1536 bv = scm_c_take_gc_bytevector (buf, len, SCM_BOOL_F);
1537
1538 scm_decoding_error (func_name, errno_save,
1539 "input locale conversion error", bv);
1540}
1541
fac32b51 1542SCM
587a3355
MG
1543scm_from_stringn (const char *str, size_t len, const char *encoding,
1544 scm_t_string_failed_conversion_handler handler)
1545{
1546 size_t u32len, i;
1547 scm_t_wchar *u32;
1548 int wide = 0;
1549 SCM res;
1550
d40e1ca8 1551 /* The order of these checks is important. */
a574564c 1552 if (!str && len != 0)
d40e1ca8
AW
1553 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1554 if (len == (size_t) -1)
1555 len = strlen (str);
fac32b51 1556
f6f4feb0 1557 if (c_strcasecmp (encoding, "ISO-8859-1") == 0 || len == 0)
098818a1 1558 return scm_from_latin1_stringn (str, len);
f6f4feb0 1559 else if (c_strcasecmp (encoding, "UTF-8") == 0
8c76a897 1560 && handler == SCM_FAILED_CONVERSION_ERROR)
098818a1 1561 return scm_from_utf8_stringn (str, len);
889975e5 1562
587a3355
MG
1563 u32len = 0;
1564 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1565 (enum iconv_ilseq_handler)
1566 handler,
1567 str, len,
1568 NULL,
1569 NULL, &u32len);
1570
ef7e4ba3 1571 if (SCM_UNLIKELY (u32 == NULL))
41d1d984 1572 decoding_error (__func__, errno, str, len);
587a3355
MG
1573
1574 i = 0;
1575 while (i < u32len)
1576 if (u32[i++] > 0xFF)
1577 {
1578 wide = 1;
1579 break;
1580 }
1581
1582 if (!wide)
1583 {
1584 char *dst;
190d4b0d 1585 res = scm_i_make_string (u32len, &dst, 0);
587a3355
MG
1586 for (i = 0; i < u32len; i ++)
1587 dst[i] = (unsigned char) u32[i];
1588 dst[u32len] = '\0';
1589 }
1590 else
1591 {
1592 scm_t_wchar *wdst;
190d4b0d 1593 res = scm_i_make_wide_string (u32len, &wdst, 0);
587a3355
MG
1594 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1595 wdst[u32len] = 0;
1596 }
1597
1598 free (u32);
1599 return res;
1600}
1601
cf313a94 1602SCM
d40e1ca8 1603scm_from_locale_string (const char *str)
cf313a94 1604{
d40e1ca8 1605 return scm_from_locale_stringn (str, -1);
cf313a94
MG
1606}
1607
c829a427
MV
1608SCM
1609scm_from_locale_stringn (const char *str, size_t len)
1610{
95f5e303 1611 return scm_from_stringn (str, len, locale_charset (),
b22e94db 1612 scm_i_default_port_conversion_handler ());
c829a427 1613}
4d4528e7 1614
c829a427 1615SCM
d40e1ca8 1616scm_from_latin1_string (const char *str)
4d4528e7 1617{
d40e1ca8
AW
1618 return scm_from_latin1_stringn (str, -1);
1619}
9c44cd45 1620
d40e1ca8
AW
1621SCM
1622scm_from_latin1_stringn (const char *str, size_t len)
1623{
e9a35a96
LC
1624 char *buf;
1625 SCM result;
1626
1627 if (len == (size_t) -1)
1628 len = strlen (str);
1629
1630 /* Make a narrow string and copy STR as is. */
190d4b0d 1631 result = scm_i_make_string (len, &buf, 0);
e9a35a96
LC
1632 memcpy (buf, str, len);
1633
1634 return result;
c829a427 1635}
4d4528e7 1636
587a3355 1637SCM
d40e1ca8 1638scm_from_utf8_string (const char *str)
587a3355 1639{
d40e1ca8
AW
1640 return scm_from_utf8_stringn (str, -1);
1641}
1642
1643SCM
1644scm_from_utf8_stringn (const char *str, size_t len)
1645{
41d1d984
AW
1646 size_t i, char_len;
1647 const scm_t_uint8 *ustr = (const scm_t_uint8 *) str;
1648 int ascii = 1, narrow = 1;
1649 SCM res;
1650
1651 if (len == (size_t) -1)
1652 len = strlen (str);
1653
1654 i = 0;
1655 char_len = 0;
1656
1657 while (i < len)
1658 {
1659 if (ustr[i] <= 127)
1660 {
1661 char_len++;
1662 i++;
1663 }
1664 else
1665 {
1666 ucs4_t c;
1667 int nbytes;
1668
1669 ascii = 0;
1670
1671 nbytes = u8_mbtouc (&c, ustr + i, len - i);
1672
8c76a897 1673 if (c == 0xfffd)
41d1d984
AW
1674 /* Bad UTF-8. */
1675 decoding_error (__func__, errno, str, len);
1676
1677 if (c > 255)
1678 narrow = 0;
1679
1680 char_len++;
1681 i += nbytes;
1682 }
1683 }
1684
1685 if (ascii)
1686 {
1687 char *dst;
1688 res = scm_i_make_string (char_len, &dst, 0);
1689 memcpy (dst, str, len);
1690 }
1691 else if (narrow)
1692 {
1693 char *dst;
1694 size_t j;
1695 ucs4_t c;
1696
1697 res = scm_i_make_string (char_len, &dst, 0);
1698
33aadcab 1699 for (i = 0, j = 0; i < len; j++)
41d1d984
AW
1700 {
1701 i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1702 dst[j] = (signed char) c;
1703 }
1704 }
1705 else
1706 {
1707 scm_t_wchar *dst;
1708 size_t j;
1709 ucs4_t c;
1710
1711 res = scm_i_make_wide_string (char_len, &dst, 0);
1712
33aadcab 1713 for (i = 0, j = 0; i < len; j++)
41d1d984
AW
1714 {
1715 i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1716 dst[j] = c;
1717 }
1718 }
1719
1720 return res;
587a3355
MG
1721}
1722
647dc1ac
LC
1723SCM
1724scm_from_utf32_string (const scm_t_wchar *str)
1725{
1726 return scm_from_utf32_stringn (str, -1);
1727}
1728
1729SCM
1730scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1731{
1732 SCM result;
1733 scm_t_wchar *buf;
1734
1735 if (len == (size_t) -1)
1736 len = u32_strlen ((uint32_t *) str);
1737
190d4b0d 1738 result = scm_i_make_wide_string (len, &buf, 0);
647dc1ac
LC
1739 memcpy (buf, str, len * sizeof (scm_t_wchar));
1740 scm_i_try_narrow_string (result);
1741
1742 return result;
1743}
1744
08467a7e
AW
1745SCM
1746scm_from_port_string (const char *str, SCM port)
1747{
1748 return scm_from_port_stringn (str, -1, port);
1749}
1750
1751SCM
1752scm_from_port_stringn (const char *str, size_t len, SCM port)
1753{
1754 scm_t_port *pt = SCM_PTAB_ENTRY (port);
f6f4feb0 1755 scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
08467a7e 1756
f6f4feb0 1757 if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1)
08467a7e 1758 return scm_from_latin1_stringn (str, len);
f6f4feb0 1759 else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8
25752c4d
AW
1760 && (pt->ilseq_handler == SCM_FAILED_CONVERSION_ERROR
1761 || (u8_check ((uint8_t *) str, len) == NULL)))
08467a7e
AW
1762 return scm_from_utf8_stringn (str, len);
1763 else
1764 return scm_from_stringn (str, len, pt->encoding, pt->ilseq_handler);
1765}
1766
50b1996f
MG
1767/* Create a new scheme string from the C string STR. The memory of
1768 STR may be used directly as storage for the new string. */
13a94556
LC
1769/* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1770 would be to register a finalizer to eventually free(3) STR, which isn't
1771 worth it. Should we just deprecate the `scm_take_' functions? */
c829a427
MV
1772SCM
1773scm_take_locale_stringn (char *str, size_t len)
1774{
13a94556 1775 SCM res;
48ddf0d9 1776
13a94556
LC
1777 res = scm_from_locale_stringn (str, len);
1778 free (str);
c829a427 1779
c829a427
MV
1780 return res;
1781}
1782
48ddf0d9
KR
1783SCM
1784scm_take_locale_string (char *str)
1785{
1786 return scm_take_locale_stringn (str, -1);
1787}
1788
f1ee6d54
LC
1789/* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1790 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
31d4d02b
LC
1791 Set *LENP to the size of the resulting string.
1792
1793 FIXME: This is a hack we should get rid of. See
1794 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1795 for details. */
1796static void
1797unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
9c44cd45
MG
1798{
1799 char *before, *after;
1800 size_t i, j;
1801
4ff2b9f4
LC
1802 before = buf;
1803 after = buf;
9c44cd45
MG
1804 i = 0;
1805 j = 0;
1806 while (i < *lenp)
1807 {
1808 if ((i <= *lenp - 6)
1809 && before[i] == '\\'
1810 && before[i + 1] == 'u'
1811 && before[i + 2] == '0' && before[i + 3] == '0')
1812 {
1813 /* Convert \u00NN to \xNN */
1814 after[j] = '\\';
1815 after[j + 1] = 'x';
30a6b9ca
MG
1816 after[j + 2] = tolower ((int) before[i + 4]);
1817 after[j + 3] = tolower ((int) before[i + 5]);
9c44cd45
MG
1818 i += 6;
1819 j += 4;
1820 }
1821 else if ((i <= *lenp - 10)
1822 && before[i] == '\\'
1823 && before[i + 1] == 'U'
1824 && before[i + 2] == '0' && before[i + 3] == '0')
1825 {
1826 /* Convert \U00NNNNNN to \UNNNNNN */
1827 after[j] = '\\';
1828 after[j + 1] = 'U';
30a6b9ca
MG
1829 after[j + 2] = tolower ((int) before[i + 4]);
1830 after[j + 3] = tolower ((int) before[i + 5]);
1831 after[j + 4] = tolower ((int) before[i + 6]);
1832 after[j + 5] = tolower ((int) before[i + 7]);
1833 after[j + 6] = tolower ((int) before[i + 8]);
1834 after[j + 7] = tolower ((int) before[i + 9]);
9c44cd45
MG
1835 i += 10;
1836 j += 8;
1837 }
1838 else
1839 {
1840 after[j] = before[i];
1841 i++;
1842 j++;
1843 }
1844 }
1845 *lenp = j;
9c44cd45
MG
1846}
1847
f1ee6d54
LC
1848/* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1849 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1850 of the resulting string. BUF must be large enough to handle the
1851 worst case when `\uXXXX' escapes (6 characters) are replaced by
1852 `\xXXXX;' (7 characters). */
31d4d02b
LC
1853static void
1854unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
d31b9519
MG
1855{
1856 char *before, *after;
1857 size_t i, j;
1858 /* The worst case is if the input string contains all 4-digit hex escapes.
1859 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1860 size_t max_out_len = (*lenp * 7) / 6 + 1;
1861 size_t nzeros, ndigits;
1862
4ff2b9f4 1863 before = buf;
d31b9519
MG
1864 after = alloca (max_out_len);
1865 i = 0;
1866 j = 0;
1867 while (i < *lenp)
1868 {
1869 if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1870 || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1871 {
1872 if (before[i + 1] == 'u')
1873 ndigits = 4;
1874 else if (before[i + 1] == 'U')
1875 ndigits = 8;
1876 else
1877 abort ();
1878
1879 /* Add the R6RS hex escape initial sequence. */
1880 after[j] = '\\';
1881 after[j + 1] = 'x';
1882
1883 /* Move string positions to the start of the hex numbers. */
1884 i += 2;
1885 j += 2;
1886
1887 /* Find the number of initial zeros in this hex number. */
1888 nzeros = 0;
1889 while (before[i + nzeros] == '0' && nzeros < ndigits)
1890 nzeros++;
1891
1892 /* Copy the number, skipping initial zeros, and then move the string
1893 positions. */
1894 if (nzeros == ndigits)
1895 {
1896 after[j] = '0';
1897 i += ndigits;
1898 j += 1;
1899 }
1900 else
1901 {
1902 int pos;
1903 for (pos = 0; pos < ndigits - nzeros; pos++)
1904 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1905 i += ndigits;
1906 j += (ndigits - nzeros);
1907 }
1908
1909 /* Add terminating semicolon. */
1910 after[j] = ';';
1911 j++;
1912 }
1913 else
1914 {
1915 after[j] = before[i];
1916 i++;
1917 j++;
1918 }
1919 }
1920 *lenp = j;
d31b9519
MG
1921 memcpy (before, after, j);
1922}
1923
cf313a94 1924char *
d40e1ca8 1925scm_to_locale_string (SCM str)
cf313a94 1926{
d40e1ca8 1927 return scm_to_locale_stringn (str, NULL);
cf313a94 1928}
d31b9519 1929
c829a427 1930char *
fac32b51 1931scm_to_locale_stringn (SCM str, size_t *lenp)
c829a427 1932{
b22e94db 1933 return scm_to_stringn (str, lenp,
95f5e303 1934 locale_charset (),
b22e94db 1935 scm_i_default_port_conversion_handler ());
9c44cd45
MG
1936}
1937
d40e1ca8
AW
1938char *
1939scm_to_latin1_string (SCM str)
1940{
1941 return scm_to_latin1_stringn (str, NULL);
1942}
1943
1944char *
1945scm_to_latin1_stringn (SCM str, size_t *lenp)
e9a35a96 1946#define FUNC_NAME "scm_to_latin1_stringn"
d40e1ca8 1947{
e9a35a96
LC
1948 char *result;
1949
1950 SCM_VALIDATE_STRING (1, str);
1951
1952 if (scm_i_is_narrow_string (str))
1953 {
fe133640
AW
1954 size_t len = scm_i_string_length (str);
1955
e9a35a96 1956 if (lenp)
fe133640 1957 *lenp = len;
e9a35a96 1958
fe133640 1959 result = scm_strndup (scm_i_string_data (str), len);
e9a35a96
LC
1960 }
1961 else
1962 result = scm_to_stringn (str, lenp, NULL,
fe133640 1963 SCM_FAILED_CONVERSION_ERROR);
e9a35a96
LC
1964
1965 return result;
d40e1ca8 1966}
e9a35a96 1967#undef FUNC_NAME
d40e1ca8
AW
1968
1969char *
1970scm_to_utf8_string (SCM str)
1971{
1972 return scm_to_utf8_stringn (str, NULL);
1973}
1974
e3d45974
AW
1975static size_t
1976latin1_u8_strlen (const scm_t_uint8 *str, size_t len)
1977{
1978 size_t ret, i;
1979 for (i = 0, ret = 0; i < len; i++)
1980 ret += (str[i] < 128) ? 1 : 2;
1981 return ret;
1982}
1983
1984static scm_t_uint8*
1985latin1_to_u8 (const scm_t_uint8 *str, size_t latin_len,
1986 scm_t_uint8 *u8_result, size_t *u8_lenp)
1987{
1988 size_t i, n;
1989 size_t u8_len = latin1_u8_strlen (str, latin_len);
1990
1991 if (!(u8_result && u8_lenp && *u8_lenp > u8_len))
1992 u8_result = scm_malloc (u8_len + 1);
1993 if (u8_lenp)
1994 *u8_lenp = u8_len;
1995
1996 for (i = 0, n = 0; i < latin_len; i++)
1997 n += u8_uctomb (u8_result + n, str[i], u8_len - n);
1998 if (n != u8_len)
1999 abort ();
2000 u8_result[n] = 0;
2001
2002 return u8_result;
2003}
2004
e26da7a2
MW
2005/* UTF-8 code table
2006
2007 (Note that this includes code points that are not allowed by Unicode,
2008 but since this function has no way to report an error, and its
2009 purpose is to determine the size of destination buffers for
2010 libunicode conversion functions, we err on the safe side and handle
2011 everything that libunicode might conceivably handle, now or in the
2012 future.)
2c1b7951
AW
2013
2014 Char. number range | UTF-8 octet sequence
2015 (hexadecimal) | (binary)
e26da7a2 2016 --------------------+------------------------------------------------------
2c1b7951
AW
2017 0000 0000-0000 007F | 0xxxxxxx
2018 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
2019 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
e26da7a2
MW
2020 0001 0000-001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
2021 0020 0000-03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2022 0400 0000-7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2c1b7951
AW
2023*/
2024
2025static size_t
e26da7a2 2026u32_u8_length_in_bytes (const scm_t_uint32 *str, size_t len)
2c1b7951
AW
2027{
2028 size_t ret, i;
2029
2030 for (i = 0, ret = 0; i < len; i++)
2031 {
2032 scm_t_uint32 c = str[i];
2033
2034 if (c <= 0x7f)
2035 ret += 1;
2036 else if (c <= 0x7ff)
2037 ret += 2;
2038 else if (c <= 0xffff)
2039 ret += 3;
e26da7a2 2040 else if (c <= 0x1fffff)
2c1b7951 2041 ret += 4;
e26da7a2
MW
2042 else if (c <= 0x3ffffff)
2043 ret += 5;
2044 else
2045 ret += 6;
2c1b7951
AW
2046 }
2047
2048 return ret;
2049}
2050
d40e1ca8
AW
2051char *
2052scm_to_utf8_stringn (SCM str, size_t *lenp)
ee26a9eb 2053#define FUNC_NAME "scm_to_utf8_stringn"
d40e1ca8 2054{
ee26a9eb
AW
2055 SCM_VALIDATE_STRING (1, str);
2056
e3d45974
AW
2057 if (scm_i_is_narrow_string (str))
2058 return (char *) latin1_to_u8 ((scm_t_uint8 *) scm_i_string_chars (str),
2059 scm_i_string_length (str),
2060 NULL, lenp);
2061 else
2c1b7951 2062 {
bbb9f000 2063 scm_t_uint32 *chars = (scm_t_uint32 *) scm_i_string_wide_chars (str);
2c1b7951 2064 scm_t_uint8 *buf, *ret;
bbb9f000
MW
2065 size_t num_chars = scm_i_string_length (str);
2066 size_t num_bytes_predicted, num_bytes_actual;
2c1b7951 2067
bbb9f000 2068 num_bytes_predicted = u32_u8_length_in_bytes (chars, num_chars);
2c1b7951 2069
e26da7a2 2070 if (lenp)
2c1b7951 2071 {
bbb9f000
MW
2072 *lenp = num_bytes_predicted;
2073 buf = scm_malloc (num_bytes_predicted);
2c1b7951 2074 }
e26da7a2
MW
2075 else
2076 {
bbb9f000
MW
2077 buf = scm_malloc (num_bytes_predicted + 1);
2078 buf[num_bytes_predicted] = 0;
e26da7a2
MW
2079 }
2080
bbb9f000
MW
2081 num_bytes_actual = num_bytes_predicted;
2082 ret = u32_to_u8 (chars, num_chars, buf, &num_bytes_actual);
e26da7a2 2083
bbb9f000 2084 if (SCM_LIKELY (ret == buf && num_bytes_actual == num_bytes_predicted))
e26da7a2 2085 return (char *) ret;
2c1b7951
AW
2086
2087 /* An error: a bad codepoint. */
2088 {
2089 int saved_errno = errno;
2090
2091 free (buf);
2092 if (!saved_errno)
2093 abort ();
2094
2095 scm_decoding_error ("scm_to_utf8_stringn", errno,
2096 "invalid codepoint in string", str);
2097
2098 /* Not reached. */
2099 return NULL;
2100 }
2101 }
d40e1ca8 2102}
ee26a9eb 2103#undef FUNC_NAME
d40e1ca8 2104
647dc1ac
LC
2105scm_t_wchar *
2106scm_to_utf32_string (SCM str)
2107{
2108 return scm_to_utf32_stringn (str, NULL);
2109}
2110
2111scm_t_wchar *
2112scm_to_utf32_stringn (SCM str, size_t *lenp)
2113#define FUNC_NAME "scm_to_utf32_stringn"
2114{
2115 scm_t_wchar *result;
2116
2117 SCM_VALIDATE_STRING (1, str);
2118
2119 if (scm_i_is_narrow_string (str))
e3d45974
AW
2120 {
2121 scm_t_uint8 *codepoints;
2122 size_t i, len;
2123
2124 codepoints = (scm_t_uint8*) scm_i_string_chars (str);
2125 len = scm_i_string_length (str);
2126 if (lenp)
2127 *lenp = len;
2128
2129 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2130 for (i = 0; i < len; i++)
2131 result[i] = codepoints[i];
2132 result[len] = 0;
2133 }
647dc1ac
LC
2134 else
2135 {
2136 size_t len;
2137
2138 len = scm_i_string_length (str);
2139 if (lenp)
2140 *lenp = len;
2141
2142 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2143 memcpy (result, scm_i_string_wide_chars (str),
2144 len * sizeof (scm_t_wchar));
2145 result[len] = 0;
2146 }
2147
2148 return result;
2149}
2150#undef FUNC_NAME
2151
08467a7e
AW
2152char *
2153scm_to_port_string (SCM str, SCM port)
2154{
2155 return scm_to_port_stringn (str, NULL, port);
2156}
2157
2158char *
2159scm_to_port_stringn (SCM str, size_t *lenp, SCM port)
2160{
2161 scm_t_port *pt = SCM_PTAB_ENTRY (port);
f6f4feb0 2162 scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
08467a7e 2163
f6f4feb0 2164 if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1
08467a7e
AW
2165 && pt->ilseq_handler == SCM_FAILED_CONVERSION_ERROR)
2166 return scm_to_latin1_stringn (str, lenp);
f6f4feb0 2167 else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8)
08467a7e
AW
2168 return scm_to_utf8_stringn (str, lenp);
2169 else
2170 return scm_to_stringn (str, lenp, pt->encoding, pt->ilseq_handler);
2171}
2172
29bcdbb0
LC
2173/* Return a malloc(3)-allocated buffer containing the contents of STR encoded
2174 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
2175 the returned buffer. If the conversion to ENCODING fails, apply the strategy
2176 defined by HANDLER. */
9c44cd45 2177char *
587a3355 2178scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
eca29b02 2179 scm_t_string_failed_conversion_handler handler)
9c44cd45 2180{
9c44cd45
MG
2181 char *buf;
2182 size_t ilen, len, i;
889975e5
MG
2183 int ret;
2184 const char *enc;
4d4528e7 2185
3ee86942 2186 if (!scm_is_string (str))
c829a427 2187 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
2188 ilen = scm_i_string_length (str);
2189
2190 if (ilen == 0)
2191 {
2192 buf = scm_malloc (1);
2193 buf[0] = '\0';
2194 if (lenp)
2195 *lenp = 0;
2196 return buf;
2197 }
587a3355 2198
c829a427 2199 if (lenp == NULL)
9c44cd45
MG
2200 for (i = 0; i < ilen; i++)
2201 if (scm_i_string_ref (str, i) == '\0')
2202 scm_misc_error (NULL,
2203 "string contains #\\nul character: ~S",
2204 scm_list_1 (str));
2205
f6f4feb0
MW
2206 if (scm_i_is_narrow_string (str)
2207 && c_strcasecmp (encoding, "ISO-8859-1") == 0)
c829a427 2208 {
889975e5
MG
2209 /* If using native Latin-1 encoding, just copy the string
2210 contents. */
9c44cd45
MG
2211 if (lenp)
2212 {
2213 buf = scm_malloc (ilen);
2214 memcpy (buf, scm_i_string_chars (str), ilen);
2215 *lenp = ilen;
2216 return buf;
2217 }
2218 else
2219 {
2220 buf = scm_malloc (ilen + 1);
2221 memcpy (buf, scm_i_string_chars (str), ilen);
2222 buf[ilen] = '\0';
2223 return buf;
2224 }
c829a427 2225 }
9c44cd45 2226
587a3355 2227
9c44cd45
MG
2228 buf = NULL;
2229 len = 0;
889975e5
MG
2230 enc = encoding;
2231 if (enc == NULL)
2232 enc = "ISO-8859-1";
2233 if (scm_i_is_narrow_string (str))
2234 {
2235 ret = mem_iconveh (scm_i_string_chars (str), ilen,
2236 "ISO-8859-1", enc,
2237 (enum iconv_ilseq_handler) handler, NULL,
2238 &buf, &len);
9c44cd45 2239
889975e5 2240 if (ret != 0)
ef7e4ba3 2241 scm_encoding_error (__func__, errno,
6851d3be
LC
2242 "cannot convert narrow string to output locale",
2243 SCM_BOOL_F,
2244 /* FIXME: Faulty character unknown. */
2245 SCM_BOOL_F);
889975e5
MG
2246 }
2247 else
2248 {
d31b9519 2249 buf = u32_conv_to_encoding (enc,
889975e5 2250 (enum iconv_ilseq_handler) handler,
d31b9519 2251 (scm_t_uint32 *) scm_i_string_wide_chars (str),
889975e5
MG
2252 ilen,
2253 NULL,
2254 NULL, &len);
2255 if (buf == NULL)
ef7e4ba3 2256 scm_encoding_error (__func__, errno,
6851d3be
LC
2257 "cannot convert wide string to output locale",
2258 SCM_BOOL_F,
2259 /* FIXME: Faulty character unknown. */
2260 SCM_BOOL_F);
d31b9519
MG
2261 }
2262 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
2263 {
2264 if (SCM_R6RS_ESCAPES_P)
f1ee6d54
LC
2265 {
2266 /* The worst case is if the input string contains all 4-digit
2267 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
2268 (seven characters). Make BUF large enough to hold
2269 that. */
2270 buf = scm_realloc (buf, (len * 7) / 6 + 1);
31d4d02b 2271 unistring_escapes_to_r6rs_escapes (buf, &len);
f1ee6d54 2272 }
d31b9519 2273 else
31d4d02b 2274 unistring_escapes_to_guile_escapes (buf, &len);
4ff2b9f4
LC
2275
2276 buf = scm_realloc (buf, len);
889975e5 2277 }
9c44cd45 2278 if (lenp)
4d4528e7 2279 *lenp = len;
9c44cd45
MG
2280 else
2281 {
2282 buf = scm_realloc (buf, len + 1);
2283 buf[len] = '\0';
2284 }
24933780 2285
c829a427 2286 scm_remember_upto_here_1 (str);
9c44cd45 2287 return buf;
4d4528e7 2288}
af68e5e5 2289
c829a427
MV
2290size_t
2291scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
2292{
2293 size_t len;
9c44cd45 2294 char *result = NULL;
3ee86942 2295 if (!scm_is_string (str))
c829a427 2296 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
2297 result = scm_to_locale_stringn (str, &len);
2298
2299 memcpy (buf, result, (len > max_len) ? max_len : len);
2300 free (result);
2301
c829a427
MV
2302 scm_remember_upto_here_1 (str);
2303 return len;
2304}
af68e5e5 2305
a3d7d5d5
LC
2306\f
2307/* Unicode string normalization. */
2308
edb7bb47
JG
2309/* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2310 libguile/i18n.c. It would be useful to have this factored out into a more
2311 convenient location, but its use of alloca makes that tricky to do. */
2312
2313static SCM
2314normalize_str (SCM string, uninorm_t form)
2315{
2316 SCM ret;
2317 scm_t_uint32 *w_str;
2318 scm_t_wchar *cbuf;
2319 size_t rlen, len = scm_i_string_length (string);
2320
2321 if (scm_i_is_narrow_string (string))
2322 {
2323 size_t i;
2324 const char *buf = scm_i_string_chars (string);
2325
2326 w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2327
2328 for (i = 0; i < len; i ++)
2329 w_str[i] = (unsigned char) buf[i];
2330 w_str[len] = 0;
2331 }
d8164b04
JG
2332 else
2333 w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2334
edb7bb47
JG
2335 w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2336
190d4b0d 2337 ret = scm_i_make_wide_string (rlen, &cbuf, 0);
edb7bb47
JG
2338 u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2339 free (w_str);
d8164b04
JG
2340
2341 scm_i_try_narrow_string (ret);
2342
edb7bb47
JG
2343 return ret;
2344}
2345
2346SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2347 (SCM string),
2348 "Returns the NFC normalized form of @var{string}.")
2349#define FUNC_NAME s_scm_string_normalize_nfc
2350{
2351 SCM_VALIDATE_STRING (1, string);
2352 return normalize_str (string, UNINORM_NFC);
2353}
2354#undef FUNC_NAME
2355
2356SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2357 (SCM string),
2358 "Returns the NFD normalized form of @var{string}.")
2359#define FUNC_NAME s_scm_string_normalize_nfd
2360{
2361 SCM_VALIDATE_STRING (1, string);
2362 return normalize_str (string, UNINORM_NFD);
2363}
2364#undef FUNC_NAME
2365
2366SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2367 (SCM string),
2368 "Returns the NFKC normalized form of @var{string}.")
2369#define FUNC_NAME s_scm_string_normalize_nfkc
2370{
2371 SCM_VALIDATE_STRING (1, string);
2372 return normalize_str (string, UNINORM_NFKC);
2373}
2374#undef FUNC_NAME
2375
2376SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2377 (SCM string),
2378 "Returns the NFKD normalized form of @var{string}.")
2379#define FUNC_NAME s_scm_string_normalize_nfkd
2380{
2381 SCM_VALIDATE_STRING (1, string);
2382 return normalize_str (string, UNINORM_NFKD);
2383}
2384#undef FUNC_NAME
2385
7505c6e0
MW
2386/* converts C scm_array of strings to SCM scm_list of strings.
2387 If argc < 0, a null terminated scm_array is assumed.
2388 The current locale encoding is assumed */
9c44cd45 2389SCM
3ee86942
MV
2390scm_makfromstrs (int argc, char **argv)
2391{
2392 int i = argc;
2393 SCM lst = SCM_EOL;
2394 if (0 > i)
2395 for (i = 0; argv[i]; i++);
2396 while (i--)
2397 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2398 return lst;
2399}
2400
c829a427 2401/* Return a newly allocated array of char pointers to each of the strings
7505c6e0
MW
2402 in args, with a terminating NULL pointer. The strings are encoded using
2403 the current locale. */
c829a427
MV
2404
2405char **
2406scm_i_allocate_string_pointers (SCM list)
2a776823 2407#define FUNC_NAME "scm_i_allocate_string_pointers"
af68e5e5 2408{
c829a427 2409 char **result;
7505c6e0 2410 int list_len = scm_ilength (list);
c829a427
MV
2411 int i;
2412
7505c6e0 2413 if (list_len < 0)
c829a427
MV
2414 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2415
7505c6e0 2416 result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2a776823 2417 "string pointers");
7505c6e0 2418 result[list_len] = NULL;
c829a427 2419
7505c6e0 2420 /* The list might have been modified in another thread, so
c829a427
MV
2421 we check LIST before each access.
2422 */
7505c6e0 2423 for (i = 0; i < list_len && scm_is_pair (list); i++)
c829a427 2424 {
7505c6e0
MW
2425 SCM str = SCM_CAR (list);
2426 size_t len; /* String length in bytes */
2427 char *c_str = scm_to_locale_stringn (str, &len);
2428
2429 /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2430 scm_malloc to allocate the returned string, which must be
2431 explicitly deallocated. This forces us to copy the string a
2432 second time into a new buffer. Ideally there would be variants
2433 of scm_to_*_stringn that can return garbage-collected buffers. */
2434
2435 result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2436 memcpy (result[i], c_str, len);
2a776823 2437 result[i][len] = '\0';
7505c6e0 2438 free (c_str);
2a776823 2439
c829a427
MV
2440 list = SCM_CDR (list);
2441 }
2442
c829a427 2443 return result;
af68e5e5 2444}
2a776823 2445#undef FUNC_NAME
24933780 2446
6f14f578
MV
2447void
2448scm_i_get_substring_spec (size_t len,
2449 SCM start, size_t *cstart,
2450 SCM end, size_t *cend)
2451{
2452 if (SCM_UNBNDP (start))
2453 *cstart = 0;
2454 else
2455 *cstart = scm_to_unsigned_integer (start, 0, len);
2456
2457 if (SCM_UNBNDP (end))
2458 *cend = len;
2459 else
2460 *cend = scm_to_unsigned_integer (end, *cstart, len);
2461}
2462
2a610be5
AW
2463static SCM
2464string_handle_ref (scm_t_array_handle *h, size_t index)
2465{
2466 return scm_c_string_ref (h->array, index);
2467}
2468
2469static void
2470string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2471{
2472 scm_c_string_set_x (h->array, index, val);
2473}
2474
2475static void
2476string_get_handle (SCM v, scm_t_array_handle *h)
2477{
2478 h->array = v;
2479 h->ndims = 1;
2480 h->dims = &h->dim0;
2481 h->dim0.lbnd = 0;
2482 h->dim0.ubnd = scm_c_string_length (v) - 1;
2483 h->dim0.inc = 1;
2484 h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2485 h->elements = h->writable_elements = NULL;
2486}
2487
c5f17102 2488SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2a610be5 2489 string_handle_ref, string_handle_set,
f65e0168
LC
2490 string_get_handle)
2491SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2a610be5 2492
0f2d19dd
JB
2493void
2494scm_init_strings ()
0f2d19dd 2495{
a7e392c1 2496 scm_nullstr = scm_i_make_string (0, NULL, 0);
7c33806a 2497
a0599745 2498#include "libguile/strings.x"
0f2d19dd
JB
2499}
2500
89e00824
ML
2501
2502/*
2503 Local Variables:
2504 c-file-style: "gnu"
2505 End:
2506*/