Adapt GDB integration to newest patches
[bpt/guile.git] / libguile / strings.c
CommitLineData
cf64dca6 1/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc.
0f2d19dd 2 *
73be1d9e 3 * This library is free software; you can redistribute it and/or
53befeb7
NJ
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
0f2d19dd 7 *
53befeb7
NJ
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
0f2d19dd 12 *
73be1d9e
MV
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
53befeb7
NJ
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
73be1d9e 17 */
1bbd0b84 18
1bbd0b84 19
0f2d19dd 20\f
dbb605f5
LC
21#ifdef HAVE_CONFIG_H
22# include <config.h>
23#endif
0f2d19dd 24
cdd47ec7 25#include <alloca.h>
faf2c9d7 26#include <string.h>
3ee86942 27#include <stdio.h>
9c44cd45 28#include <ctype.h>
edb7bb47 29#include <uninorm.h>
9c44cd45 30#include <unistr.h>
eca29b02 31#include <uniconv.h>
f6f4feb0 32#include <c-strcase.h>
faf2c9d7 33
889975e5
MG
34#include "striconveh.h"
35
a0599745
MD
36#include "libguile/_scm.h"
37#include "libguile/chars.h"
7c33806a 38#include "libguile/root.h"
a0599745 39#include "libguile/strings.h"
f6f4feb0
MW
40#include "libguile/ports.h"
41#include "libguile/ports-internal.h"
a3d7d5d5 42#include "libguile/error.h"
f45eccff 43#include "libguile/generalized-vectors.h"
1afff620 44#include "libguile/deprecation.h"
a0599745 45#include "libguile/validate.h"
d31b9519 46#include "libguile/private-options.h"
1afff620 47
0f2d19dd
JB
48\f
49
50/* {Strings}
51 */
52
3ee86942
MV
53
54/* Stringbufs
55 *
56 * XXX - keeping an accurate refcount during GC seems to be quite
57 * tricky, so we just keep score of whether a stringbuf might be
50b1996f 58 * shared, not whether it definitely is.
3ee86942
MV
59 *
60 * The scheme I (mvo) tried to keep an accurate reference count would
61 * recount all strings that point to a stringbuf during the mark-phase
62 * of the GC. This was done since one cannot access the stringbuf of
63 * a string when that string is freed (in order to decrease the
64 * reference count). The memory of the stringbuf might have been
65 * reused already for something completely different.
66 *
67 * This recounted worked for a small number of threads beating on
68 * cow-strings, but it failed randomly with more than 10 threads, say.
69 * I couldn't figure out what went wrong, so I used the conservative
70 * approach implemented below.
50b1996f 71 *
ba54a202
LC
72 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
73 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
74 * strings.
3ee86942
MV
75 */
76
ba54a202
LC
77/* The size in words of the stringbuf header (type tag + size). */
78#define STRINGBUF_HEADER_SIZE 2U
79
80#define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
81
35920c00 82#define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
5f236208 83#define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
3ee86942
MV
84
85#define STRINGBUF_TAG scm_tc7_stringbuf
86#define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
9c44cd45 87#define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
3ee86942 88
100e20c7 89#define STRINGBUF_CONTENTS(buf) ((void *) \
ba54a202
LC
90 SCM_CELL_OBJECT_LOC (buf, \
91 STRINGBUF_HEADER_SIZE))
100e20c7
LC
92#define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
93#define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
3ee86942 94
100e20c7 95#define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
3ee86942 96
9b41542f
LC
97#define SET_STRINGBUF_SHARED(buf) \
98 do \
99 { \
100 /* Don't modify BUF if it's already marked as shared since it might be \
101 a read-only, statically allocated stringbuf. */ \
102 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
103 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
104 } \
105 while (0)
3ee86942 106
56a3dcd4 107#ifdef SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
108static size_t lenhist[1001];
109#endif
110
50b1996f
MG
111/* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
112 characters. */
3ee86942
MV
113static SCM
114make_stringbuf (size_t len)
0f2d19dd 115{
3ee86942
MV
116 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
117 scm_i_symbol_chars, all stringbufs are null-terminated. Once
118 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
119 has been changed for scm_i_symbol_chars, this null-termination
120 can be dropped.
121 */
122
ba54a202
LC
123 SCM buf;
124
56a3dcd4 125#ifdef SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
126 if (len < 1000)
127 lenhist[len]++;
128 else
129 lenhist[1000]++;
130#endif
0f2d19dd 131
21041372 132 buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
ba54a202
LC
133 "string"));
134
135 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
136 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
137
138 STRINGBUF_CHARS (buf)[len] = 0;
139
140 return buf;
3ee86942 141}
e53cc817 142
50b1996f
MG
143/* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
144 characters. */
9c44cd45
MG
145static SCM
146make_wide_stringbuf (size_t len)
147{
ba54a202
LC
148 SCM buf;
149 size_t raw_len;
150
56a3dcd4 151#ifdef SCM_STRING_LENGTH_HISTOGRAM
9c44cd45
MG
152 if (len < 1000)
153 lenhist[len]++;
154 else
155 lenhist[1000]++;
156#endif
157
ba54a202 158 raw_len = (len + 1) * sizeof (scm_t_wchar);
21041372 159 buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
ba54a202
LC
160 "string"));
161
162 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
163 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
164
165 STRINGBUF_WIDE_CHARS (buf)[len] = 0;
166
167 return buf;
9c44cd45
MG
168}
169
ba54a202
LC
170/* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
171 characters from BUF. */
172static SCM
173wide_stringbuf (SCM buf)
9c44cd45 174{
ba54a202 175 SCM new_buf;
9c44cd45
MG
176
177 if (STRINGBUF_WIDE (buf))
ba54a202
LC
178 new_buf = buf;
179 else
9c44cd45 180 {
ba54a202
LC
181 size_t i, len;
182 scm_t_wchar *mem;
9c44cd45 183
ba54a202 184 len = STRINGBUF_LENGTH (buf);
9c44cd45 185
ba54a202 186 new_buf = make_wide_stringbuf (len);
9c44cd45 187
ba54a202 188 mem = STRINGBUF_WIDE_CHARS (new_buf);
9c44cd45 189 for (i = 0; i < len; i++)
ba54a202 190 mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
9c44cd45 191 mem[len] = 0;
9c44cd45 192 }
ba54a202
LC
193
194 return new_buf;
3ee86942 195}
bd9e24b3 196
ba54a202
LC
197/* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
198 characters from BUF, if possible. */
199static SCM
587a3355
MG
200narrow_stringbuf (SCM buf)
201{
ba54a202 202 SCM new_buf;
587a3355
MG
203
204 if (!STRINGBUF_WIDE (buf))
ba54a202
LC
205 new_buf = buf;
206 else
207 {
208 size_t i, len;
209 scm_t_wchar *wmem;
210 unsigned char *mem;
587a3355 211
ba54a202
LC
212 len = STRINGBUF_LENGTH (buf);
213 wmem = STRINGBUF_WIDE_CHARS (buf);
587a3355 214
ba54a202
LC
215 for (i = 0; i < len; i++)
216 if (wmem[i] > 0xFF)
217 /* BUF cannot be narrowed. */
218 return buf;
587a3355 219
ba54a202 220 new_buf = make_stringbuf (len);
587a3355 221
ba54a202
LC
222 mem = STRINGBUF_CHARS (new_buf);
223 for (i = 0; i < len; i++)
224 mem[i] = (unsigned char) wmem[i];
225 mem[len] = 0;
226 }
227
228 return new_buf;
587a3355
MG
229}
230
9de87eea 231scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
bd9e24b3 232
ba54a202 233\f
3ee86942
MV
234/* Copy-on-write strings.
235 */
bd9e24b3 236
3ee86942 237#define STRING_TAG scm_tc7_string
bd9e24b3 238
3ee86942
MV
239#define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
240#define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
241#define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
bd9e24b3 242
3ee86942
MV
243#define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
244#define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
245
dc7da0be 246#define IS_STRING(str) (SCM_HAS_TYP7 (str, STRING_TAG))
3ee86942 247
ed35de72
MV
248/* Read-only strings.
249 */
250
35920c00 251#define RO_STRING_TAG scm_tc7_ro_string
ed35de72
MV
252#define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
253
e1b29f6a
MV
254/* Mutation-sharing substrings
255 */
256
257#define SH_STRING_TAG (scm_tc7_string + 0x100)
258
259#define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
260/* START and LENGTH as for STRINGs. */
261
262#define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
263
db071766
AW
264void
265scm_i_print_stringbuf (SCM exp, SCM port, scm_print_state *pstate)
266{
267 SCM str;
268
269 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
270 SET_STRINGBUF_SHARED (exp);
271 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
272
273 str = scm_double_cell (RO_STRING_TAG, SCM_UNPACK(exp),
274 0, STRINGBUF_LENGTH (exp));
275
276 scm_puts ("#<stringbuf ", port);
277 scm_iprin1 (str, port, pstate);
278 scm_puts (">", port);
279}
280
e7efe8e7
AW
281SCM scm_nullstr;
282
60617d81
MW
283static SCM null_stringbuf;
284
285static void
286init_null_stringbuf (void)
287{
288 null_stringbuf = make_stringbuf (0);
289 SET_STRINGBUF_SHARED (null_stringbuf);
290}
291
50b1996f
MG
292/* Create a scheme string with space for LEN 8-bit Latin-1-encoded
293 characters. CHARSP, if not NULL, will be set to location of the
190d4b0d
LC
294 char array. If READ_ONLY_P, the returned string is read-only;
295 otherwise it is writable. */
3ee86942 296SCM
190d4b0d 297scm_i_make_string (size_t len, char **charsp, int read_only_p)
3ee86942 298{
69cd5299 299 SCM buf;
3ee86942 300 SCM res;
69cd5299
MW
301
302 if (len == 0)
303 {
60617d81
MW
304 static scm_i_pthread_once_t once = SCM_I_PTHREAD_ONCE_INIT;
305 scm_i_pthread_once (&once, init_null_stringbuf);
69cd5299
MW
306 buf = null_stringbuf;
307 }
308 else
309 buf = make_stringbuf (len);
310
3ee86942 311 if (charsp)
f59cf998 312 *charsp = (char *) STRINGBUF_CHARS (buf);
190d4b0d
LC
313 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
314 SCM_UNPACK (buf),
315 (scm_t_bits) 0, (scm_t_bits) len);
3ee86942 316 return res;
0f2d19dd
JB
317}
318
50b1996f
MG
319/* Create a scheme string with space for LEN 32-bit UCS-4-encoded
320 characters. CHARSP, if not NULL, will be set to location of the
190d4b0d
LC
321 character array. If READ_ONLY_P, the returned string is read-only;
322 otherwise it is writable. */
9c44cd45 323SCM
190d4b0d 324scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
9c44cd45
MG
325{
326 SCM buf = make_wide_stringbuf (len);
327 SCM res;
328 if (charsp)
329 *charsp = STRINGBUF_WIDE_CHARS (buf);
190d4b0d
LC
330 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
331 SCM_UNPACK (buf),
9c44cd45
MG
332 (scm_t_bits) 0, (scm_t_bits) len);
333 return res;
334}
335
3ee86942
MV
336static void
337validate_substring_args (SCM str, size_t start, size_t end)
338{
339 if (!IS_STRING (str))
340 scm_wrong_type_arg_msg (NULL, 0, str, "string");
341 if (start > STRING_LENGTH (str))
342 scm_out_of_range (NULL, scm_from_size_t (start));
343 if (end > STRING_LENGTH (str) || end < start)
344 scm_out_of_range (NULL, scm_from_size_t (end));
345}
0f2d19dd 346
e1b29f6a
MV
347static inline void
348get_str_buf_start (SCM *str, SCM *buf, size_t *start)
349{
350 *start = STRING_START (*str);
351 if (IS_SH_STRING (*str))
352 {
353 *str = SH_STRING_STRING (*str);
354 *start += STRING_START (*str);
355 }
356 *buf = STRING_STRINGBUF (*str);
357}
358
3ee86942
MV
359SCM
360scm_i_substring (SCM str, size_t start, size_t end)
0f2d19dd 361{
17bec545
MW
362 if (start == end)
363 return scm_i_make_string (0, NULL, 0);
364 else
365 {
366 SCM buf;
367 size_t str_start;
368 get_str_buf_start (&str, &buf, &str_start);
369 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
370 SET_STRINGBUF_SHARED (buf);
371 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
372 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
373 (scm_t_bits)str_start + start,
374 (scm_t_bits) end - start);
375 }
0f2d19dd
JB
376}
377
ed35de72
MV
378SCM
379scm_i_substring_read_only (SCM str, size_t start, size_t end)
380{
17bec545
MW
381 if (start == end)
382 return scm_i_make_string (0, NULL, 1);
383 else
384 {
385 SCM buf;
386 size_t str_start;
387 get_str_buf_start (&str, &buf, &str_start);
388 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
389 SET_STRINGBUF_SHARED (buf);
390 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
391 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
392 (scm_t_bits)str_start + start,
393 (scm_t_bits) end - start);
394 }
ed35de72
MV
395}
396
3ee86942
MV
397SCM
398scm_i_substring_copy (SCM str, size_t start, size_t end)
399{
d5b75b6c
MW
400 if (start == end)
401 return scm_i_make_string (0, NULL, 0);
9c44cd45
MG
402 else
403 {
d5b75b6c
MW
404 size_t len = end - start;
405 SCM buf, my_buf, substr;
406 size_t str_start;
407 int wide = 0;
408 get_str_buf_start (&str, &buf, &str_start);
409 if (scm_i_is_narrow_string (str))
410 {
411 my_buf = make_stringbuf (len);
412 memcpy (STRINGBUF_CHARS (my_buf),
413 STRINGBUF_CHARS (buf) + str_start + start, len);
414 }
415 else
416 {
417 my_buf = make_wide_stringbuf (len);
418 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
419 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
420 + start), len);
421 wide = 1;
422 }
423 scm_remember_upto_here_1 (buf);
424 substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
425 (scm_t_bits) 0, (scm_t_bits) len);
426 if (wide)
427 scm_i_try_narrow_string (substr);
428 return substr;
9c44cd45 429 }
3ee86942 430}
0f2d19dd 431
e1b29f6a
MV
432SCM
433scm_i_substring_shared (SCM str, size_t start, size_t end)
434{
435 if (start == 0 && end == STRING_LENGTH (str))
436 return str;
17bec545
MW
437 else if (start == end)
438 return scm_i_make_string (0, NULL, 0);
439 else
e1b29f6a
MV
440 {
441 size_t len = end - start;
442 if (IS_SH_STRING (str))
443 {
444 start += STRING_START (str);
445 str = SH_STRING_STRING (str);
446 }
447 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
448 (scm_t_bits)start, (scm_t_bits) len);
449 }
450}
451
3ee86942
MV
452SCM
453scm_c_substring (SCM str, size_t start, size_t end)
454{
455 validate_substring_args (str, start, end);
456 return scm_i_substring (str, start, end);
457}
ee149d03 458
ed35de72
MV
459SCM
460scm_c_substring_read_only (SCM str, size_t start, size_t end)
461{
462 validate_substring_args (str, start, end);
463 return scm_i_substring_read_only (str, start, end);
464}
465
0f2d19dd 466SCM
3ee86942 467scm_c_substring_copy (SCM str, size_t start, size_t end)
0f2d19dd 468{
3ee86942
MV
469 validate_substring_args (str, start, end);
470 return scm_i_substring_copy (str, start, end);
471}
472
3ee86942
MV
473SCM
474scm_c_substring_shared (SCM str, size_t start, size_t end)
475{
476 validate_substring_args (str, start, end);
477 return scm_i_substring_shared (str, start, end);
478}
0f2d19dd 479
d6c74168 480\f
3ee86942
MV
481/* Internal accessors
482 */
483
50b1996f
MG
484/* Returns the number of characters in STR. This may be different
485 than the memory size of the string storage. */
3ee86942
MV
486size_t
487scm_i_string_length (SCM str)
0f2d19dd 488{
3ee86942 489 return STRING_LENGTH (str);
0f2d19dd
JB
490}
491
50b1996f
MG
492/* True if the string is 'narrow', meaning it has a 8-bit Latin-1
493 encoding. False if it is 'wide', having a 32-bit UCS-4
494 encoding. */
9c44cd45
MG
495int
496scm_i_is_narrow_string (SCM str)
497{
49d09292
MW
498 if (IS_SH_STRING (str))
499 str = SH_STRING_STRING (str);
500
9c44cd45
MG
501 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
502}
503
587a3355
MG
504/* Try to coerce a string to be narrow. It if is narrow already, do
505 nothing. If it is wide, shrink it to narrow if none of its
506 characters are above 0xFF. Return true if the string is narrow or
507 was made to be narrow. */
508int
509scm_i_try_narrow_string (SCM str)
510{
49d09292
MW
511 if (IS_SH_STRING (str))
512 str = SH_STRING_STRING (str);
513
ba54a202 514 SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
587a3355
MG
515
516 return scm_i_is_narrow_string (str);
517}
518
100e20c7
LC
519/* Return a pointer to the raw data of the string, which can be either Latin-1
520 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
521const void *
522scm_i_string_data (SCM str)
523{
524 SCM buf;
525 size_t start;
526 const char *data;
527
528 get_str_buf_start (&str, &buf, &start);
529
530 data = STRINGBUF_CONTENTS (buf);
531 data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
532
533 return data;
534}
535
50b1996f
MG
536/* Returns a pointer to the 8-bit Latin-1 encoded character array of
537 STR. */
3ee86942
MV
538const char *
539scm_i_string_chars (SCM str)
540{
541 SCM buf;
e1b29f6a
MV
542 size_t start;
543 get_str_buf_start (&str, &buf, &start);
9c44cd45 544 if (scm_i_is_narrow_string (str))
f59cf998 545 return (const char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
546 else
547 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
548 scm_list_1 (str));
549 return NULL;
3ee86942 550}
b00418df 551
50b1996f
MG
552/* Returns a pointer to the 32-bit UCS-4 encoded character array of
553 STR. */
9c44cd45
MG
554const scm_t_wchar *
555scm_i_string_wide_chars (SCM str)
556{
557 SCM buf;
558 size_t start;
559
560 get_str_buf_start (&str, &buf, &start);
561 if (!scm_i_is_narrow_string (str))
f59cf998 562 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
9c44cd45
MG
563 else
564 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
565 scm_list_1 (str));
566}
567
568/* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
569 a new string buffer, so that it can be modified without modifying
50b1996f
MG
570 other strings. Also, lock the string mutex. Later, one must call
571 scm_i_string_stop_writing to unlock the mutex. */
9c44cd45
MG
572SCM
573scm_i_string_start_writing (SCM orig_str)
b00418df 574{
ed35de72 575 SCM buf, str = orig_str;
e1b29f6a 576 size_t start;
ed35de72 577
e1b29f6a 578 get_str_buf_start (&str, &buf, &start);
ed35de72
MV
579 if (IS_RO_STRING (str))
580 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
581
9de87eea 582 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942
MV
583 if (STRINGBUF_SHARED (buf))
584 {
9c44cd45 585 /* Clone the stringbuf. */
3ee86942
MV
586 size_t len = STRING_LENGTH (str);
587 SCM new_buf;
588
9de87eea 589 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 590
9c44cd45
MG
591 if (scm_i_is_narrow_string (str))
592 {
593 new_buf = make_stringbuf (len);
594 memcpy (STRINGBUF_CHARS (new_buf),
595 STRINGBUF_CHARS (buf) + STRING_START (str), len);
596
597 }
598 else
599 {
600 new_buf = make_wide_stringbuf (len);
601 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
602 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
603 + STRING_START (str)), len);
604 }
3ee86942 605
3ee86942
MV
606 SET_STRING_STRINGBUF (str, new_buf);
607 start -= STRING_START (str);
902578f1
LC
608
609 /* FIXME: The following operations are not atomic, so other threads
610 looking at STR may see an inconsistent state. Nevertheless it can't
611 hurt much since (i) accessing STR while it is being mutated can't
612 yield a crash, and (ii) concurrent accesses to STR should be
613 protected by a mutex at the application level. The latter may not
614 apply when STR != ORIG_STR, though. */
3ee86942 615 SET_STRING_START (str, 0);
902578f1 616 SET_STRING_STRINGBUF (str, new_buf);
3ee86942
MV
617
618 buf = new_buf;
619
9de87eea 620 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 621 }
9c44cd45
MG
622 return orig_str;
623}
3ee86942 624
50b1996f 625/* Return a pointer to the 8-bit Latin-1 chars of a string. */
9c44cd45
MG
626char *
627scm_i_string_writable_chars (SCM str)
628{
629 SCM buf;
630 size_t start;
3ee86942 631
9c44cd45
MG
632 get_str_buf_start (&str, &buf, &start);
633 if (scm_i_is_narrow_string (str))
f59cf998 634 return (char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
635 else
636 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
637 scm_list_1 (str));
638 return NULL;
b00418df
DH
639}
640
50b1996f 641/* Return a pointer to the UCS-4 codepoints of a string. */
9c44cd45
MG
642static scm_t_wchar *
643scm_i_string_writable_wide_chars (SCM str)
644{
645 SCM buf;
646 size_t start;
647
648 get_str_buf_start (&str, &buf, &start);
649 if (!scm_i_is_narrow_string (str))
650 return STRINGBUF_WIDE_CHARS (buf) + start;
651 else
1c7b216f 652 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
9c44cd45 653 scm_list_1 (str));
b00418df
DH
654}
655
50b1996f
MG
656/* Unlock the string mutex that was locked when
657 scm_i_string_start_writing was called. */
3ee86942
MV
658void
659scm_i_string_stop_writing (void)
660{
9de87eea 661 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 662}
b00418df 663
50b1996f 664/* Return the Xth character of STR as a UCS-4 codepoint. */
9c44cd45
MG
665scm_t_wchar
666scm_i_string_ref (SCM str, size_t x)
667{
668 if (scm_i_is_narrow_string (str))
669 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
670 else
671 return scm_i_string_wide_chars (str)[x];
672}
673
889975e5
MG
674/* Returns index+1 of the first char in STR that matches C, or
675 0 if the char is not found. */
676int
677scm_i_string_contains_char (SCM str, char ch)
678{
679 size_t i;
680 size_t len = scm_i_string_length (str);
681
682 i = 0;
683 if (scm_i_is_narrow_string (str))
684 {
685 while (i < len)
686 {
687 if (scm_i_string_chars (str)[i] == ch)
688 return i+1;
689 i++;
690 }
691 }
692 else
693 {
694 while (i < len)
695 {
696 if (scm_i_string_wide_chars (str)[i]
697 == (unsigned char) ch)
698 return i+1;
699 i++;
700 }
701 }
702 return 0;
703}
704
3f47e526
MG
705int
706scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
707{
708 if (scm_i_is_narrow_string (sstr))
709 {
710 const char *a = scm_i_string_chars (sstr) + start_x;
711 const char *b = cstr;
712 return strncmp (a, b, strlen(b));
713 }
714 else
715 {
716 size_t i;
717 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
718 const char *b = cstr;
719 for (i = 0; i < strlen (b); i++)
720 {
721 if (a[i] != (unsigned char) b[i])
722 return 1;
723 }
724 }
725 return 0;
726}
727
50b1996f 728/* Set the Pth character of STR to UCS-4 codepoint CHR. */
9c44cd45
MG
729void
730scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
731{
49d09292
MW
732 if (IS_SH_STRING (str))
733 {
734 p += STRING_START (str);
735 str = SH_STRING_STRING (str);
736 }
737
9c44cd45 738 if (chr > 0xFF && scm_i_is_narrow_string (str))
ba54a202 739 SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
9c44cd45
MG
740
741 if (scm_i_is_narrow_string (str))
742 {
743 char *dst = scm_i_string_writable_chars (str);
587a3355 744 dst[p] = chr;
9c44cd45
MG
745 }
746 else
747 {
748 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
749 dst[p] = chr;
750 }
751}
752
ba54a202 753\f
3ee86942 754/* Symbols.
587a3355 755
3ee86942
MV
756 Basic symbol creation and accessing is done here, the rest is in
757 symbols.[hc]. This has been done to keep stringbufs and the
758 internals of strings and string-like objects confined to this file.
759*/
760
761#define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
762
763SCM
6869328b
MV
764scm_i_make_symbol (SCM name, scm_t_bits flags,
765 unsigned long hash, SCM props)
3ee86942
MV
766{
767 SCM buf;
768 size_t start = STRING_START (name);
769 size_t length = STRING_LENGTH (name);
770
771 if (IS_SH_STRING (name))
772 {
773 name = SH_STRING_STRING (name);
774 start += STRING_START (name);
775 }
1a4d7653 776 buf = STRING_STRINGBUF (name);
3ee86942
MV
777
778 if (start == 0 && length == STRINGBUF_LENGTH (buf))
779 {
780 /* reuse buf. */
9de87eea 781 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 782 SET_STRINGBUF_SHARED (buf);
9de87eea 783 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942
MV
784 }
785 else
786 {
787 /* make new buf. */
9c44cd45
MG
788 if (scm_i_is_narrow_string (name))
789 {
790 SCM new_buf = make_stringbuf (length);
791 memcpy (STRINGBUF_CHARS (new_buf),
792 STRINGBUF_CHARS (buf) + start, length);
793 buf = new_buf;
794 }
795 else
796 {
797 SCM new_buf = make_wide_stringbuf (length);
798 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
799 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
800 length);
801 buf = new_buf;
802 }
3ee86942 803 }
6869328b 804 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
3ee86942
MV
805 (scm_t_bits) hash, SCM_UNPACK (props));
806}
807
fd0a5bbc
HWN
808SCM
809scm_i_c_make_symbol (const char *name, size_t len,
810 scm_t_bits flags, unsigned long hash, SCM props)
811{
812 SCM buf = make_stringbuf (len);
813 memcpy (STRINGBUF_CHARS (buf), name, len);
814
65619ebe
AW
815 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
816 (scm_t_bits) hash, SCM_UNPACK (props));
fd0a5bbc
HWN
817}
818
50b1996f
MG
819/* Returns the number of characters in SYM. This may be different
820 from the memory size of SYM. */
3ee86942
MV
821size_t
822scm_i_symbol_length (SCM sym)
0f2d19dd 823{
3ee86942 824 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
0f2d19dd
JB
825}
826
071bb6a8
LC
827size_t
828scm_c_symbol_length (SCM sym)
829#define FUNC_NAME "scm_c_symbol_length"
830{
831 SCM_VALIDATE_SYMBOL (1, sym);
832
833 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
834}
835#undef FUNC_NAME
836
50b1996f
MG
837/* True if the name of SYM is stored as a Latin-1 encoded string.
838 False if it is stored as a 32-bit UCS-4-encoded string. */
9c44cd45
MG
839int
840scm_i_is_narrow_symbol (SCM sym)
841{
842 SCM buf;
843
844 buf = SYMBOL_STRINGBUF (sym);
845 return !STRINGBUF_WIDE (buf);
846}
847
50b1996f
MG
848/* Returns a pointer to the 8-bit Latin-1 encoded character array that
849 contains the name of SYM. */
3ee86942
MV
850const char *
851scm_i_symbol_chars (SCM sym)
852{
9c44cd45
MG
853 SCM buf;
854
855 buf = SYMBOL_STRINGBUF (sym);
856 if (!STRINGBUF_WIDE (buf))
f59cf998 857 return (const char *) STRINGBUF_CHARS (buf);
9c44cd45
MG
858 else
859 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
860 scm_list_1 (sym));
861}
862
50b1996f
MG
863/* Return a pointer to the 32-bit UCS-4-encoded character array of a
864 symbol's name. */
9c44cd45
MG
865const scm_t_wchar *
866scm_i_symbol_wide_chars (SCM sym)
867{
868 SCM buf;
869
870 buf = SYMBOL_STRINGBUF (sym);
871 if (STRINGBUF_WIDE (buf))
f59cf998 872 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
9c44cd45
MG
873 else
874 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
875 scm_list_1 (sym));
3ee86942 876}
1cc91f1b 877
be54b15d 878SCM
3ee86942 879scm_i_symbol_substring (SCM sym, size_t start, size_t end)
be54b15d 880{
3ee86942 881 SCM buf = SYMBOL_STRINGBUF (sym);
9de87eea 882 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 883 SET_STRINGBUF_SHARED (buf);
9de87eea 884 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
fd2b17b9 885 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
3ee86942
MV
886 (scm_t_bits)start, (scm_t_bits) end - start);
887}
be54b15d 888
50b1996f 889/* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
9c44cd45
MG
890scm_t_wchar
891scm_i_symbol_ref (SCM sym, size_t x)
892{
893 if (scm_i_is_narrow_symbol (sym))
894 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
895 else
896 return scm_i_symbol_wide_chars (sym)[x];
897}
898
3ee86942
MV
899/* Debugging
900 */
be54b15d 901
6ce6923b
MG
902SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
903 "Returns an association list containing debugging information\n"
904 "for @var{str}. The association list has the following entries."
905 "@table @code\n"
906 "@item string\n"
907 "The string itself.\n"
908 "@item start\n"
909 "The start index of the string into its stringbuf\n"
910 "@item length\n"
911 "The length of the string\n"
912 "@item shared\n"
913 "If this string is a substring, it returns its parent string.\n"
914 "Otherwise, it returns @code{#f}\n"
88ed5759
MG
915 "@item read-only\n"
916 "@code{#t} if the string is read-only\n"
6ce6923b
MG
917 "@item stringbuf-chars\n"
918 "A new string containing this string's stringbuf's characters\n"
919 "@item stringbuf-length\n"
920 "The number of characters in this stringbuf\n"
921 "@item stringbuf-shared\n"
922 "@code{#t} if this stringbuf is shared\n"
6ce6923b
MG
923 "@item stringbuf-wide\n"
924 "@code{#t} if this stringbuf's characters are stored in a\n"
925 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
926 "buffer\n"
927 "@end table")
3ee86942
MV
928#define FUNC_NAME s_scm_sys_string_dump
929{
ba54a202 930 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
6ce6923b 931 SCM buf;
3ee86942 932 SCM_VALIDATE_STRING (1, str);
6ce6923b
MG
933
934 /* String info */
4a655e50 935 e1 = scm_cons (scm_from_latin1_symbol ("string"),
6ce6923b 936 str);
4a655e50 937 e2 = scm_cons (scm_from_latin1_symbol ("start"),
6ce6923b 938 scm_from_size_t (STRING_START (str)));
4a655e50 939 e3 = scm_cons (scm_from_latin1_symbol ("length"),
6ce6923b
MG
940 scm_from_size_t (STRING_LENGTH (str)));
941
3ee86942
MV
942 if (IS_SH_STRING (str))
943 {
4a655e50 944 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
6ce6923b
MG
945 SH_STRING_STRING (str));
946 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
3ee86942
MV
947 }
948 else
949 {
4a655e50 950 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
6ce6923b
MG
951 SCM_BOOL_F);
952 buf = STRING_STRINGBUF (str);
3ee86942 953 }
9c44cd45 954
88ed5759 955 if (IS_RO_STRING (str))
4a655e50 956 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
88ed5759
MG
957 SCM_BOOL_T);
958 else
4a655e50 959 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
88ed5759 960 SCM_BOOL_F);
587a3355 961
6ce6923b 962 /* Stringbuf info */
6ce6923b
MG
963 if (!STRINGBUF_WIDE (buf))
964 {
965 size_t len = STRINGBUF_LENGTH (buf);
966 char *cbuf;
190d4b0d 967 SCM sbc = scm_i_make_string (len, &cbuf, 0);
6ce6923b 968 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
4a655e50 969 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b 970 sbc);
3ee86942 971 }
6ce6923b
MG
972 else
973 {
974 size_t len = STRINGBUF_LENGTH (buf);
975 scm_t_wchar *cbuf;
190d4b0d 976 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
6ce6923b
MG
977 u32_cpy ((scm_t_uint32 *) cbuf,
978 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
4a655e50 979 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b
MG
980 sbc);
981 }
4a655e50 982 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
6ce6923b
MG
983 scm_from_size_t (STRINGBUF_LENGTH (buf)));
984 if (STRINGBUF_SHARED (buf))
4a655e50 985 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b
MG
986 SCM_BOOL_T);
987 else
4a655e50 988 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b 989 SCM_BOOL_F);
6ce6923b 990 if (STRINGBUF_WIDE (buf))
4a655e50 991 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
ba54a202 992 SCM_BOOL_T);
6ce6923b 993 else
4a655e50 994 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
ba54a202 995 SCM_BOOL_F);
6ce6923b 996
ba54a202 997 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
3ee86942
MV
998}
999#undef FUNC_NAME
1000
6ce6923b
MG
1001SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
1002 "Returns an association list containing debugging information\n"
1003 "for @var{sym}. The association list has the following entries."
1004 "@table @code\n"
1005 "@item symbol\n"
1006 "The symbol itself\n"
1007 "@item hash\n"
1008 "Its hash value\n"
88ed5759
MG
1009 "@item interned\n"
1010 "@code{#t} if it is an interned symbol\n"
6ce6923b
MG
1011 "@item stringbuf-chars\n"
1012 "A new string containing this symbols's stringbuf's characters\n"
1013 "@item stringbuf-length\n"
1014 "The number of characters in this stringbuf\n"
1015 "@item stringbuf-shared\n"
1016 "@code{#t} if this stringbuf is shared\n"
6ce6923b
MG
1017 "@item stringbuf-wide\n"
1018 "@code{#t} if this stringbuf's characters are stored in a\n"
1019 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
1020 "buffer\n"
1021 "@end table")
3ee86942
MV
1022#define FUNC_NAME s_scm_sys_symbol_dump
1023{
ba54a202 1024 SCM e1, e2, e3, e4, e5, e6, e7;
6ce6923b 1025 SCM buf;
3ee86942 1026 SCM_VALIDATE_SYMBOL (1, sym);
4a655e50 1027 e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
6ce6923b 1028 sym);
4a655e50 1029 e2 = scm_cons (scm_from_latin1_symbol ("hash"),
6ce6923b 1030 scm_from_ulong (scm_i_symbol_hash (sym)));
4a655e50 1031 e3 = scm_cons (scm_from_latin1_symbol ("interned"),
88ed5759 1032 scm_symbol_interned_p (sym));
6ce6923b
MG
1033 buf = SYMBOL_STRINGBUF (sym);
1034
1035 /* Stringbuf info */
6ce6923b
MG
1036 if (!STRINGBUF_WIDE (buf))
1037 {
1038 size_t len = STRINGBUF_LENGTH (buf);
1039 char *cbuf;
190d4b0d 1040 SCM sbc = scm_i_make_string (len, &cbuf, 0);
6ce6923b 1041 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
4a655e50 1042 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b
MG
1043 sbc);
1044 }
9c44cd45 1045 else
6ce6923b
MG
1046 {
1047 size_t len = STRINGBUF_LENGTH (buf);
1048 scm_t_wchar *cbuf;
190d4b0d 1049 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
6ce6923b
MG
1050 u32_cpy ((scm_t_uint32 *) cbuf,
1051 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
4a655e50 1052 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b
MG
1053 sbc);
1054 }
4a655e50 1055 e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
6ce6923b
MG
1056 scm_from_size_t (STRINGBUF_LENGTH (buf)));
1057 if (STRINGBUF_SHARED (buf))
4a655e50 1058 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b
MG
1059 SCM_BOOL_T);
1060 else
4a655e50 1061 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b 1062 SCM_BOOL_F);
6ce6923b 1063 if (STRINGBUF_WIDE (buf))
4a655e50 1064 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
6ce6923b
MG
1065 SCM_BOOL_T);
1066 else
4a655e50 1067 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
6ce6923b 1068 SCM_BOOL_F);
ba54a202 1069 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
6ce6923b 1070
3ee86942
MV
1071}
1072#undef FUNC_NAME
1073
56a3dcd4 1074#ifdef SCM_STRING_LENGTH_HISTOGRAM
6ce6923b 1075
9c44cd45 1076SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
e1b29f6a 1077#define FUNC_NAME s_scm_sys_stringbuf_hist
3ee86942
MV
1078{
1079 int i;
1080 for (i = 0; i < 1000; i++)
1081 if (lenhist[i])
1082 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1083 fprintf (stderr, ">999: %u\n", lenhist[1000]);
1084 return SCM_UNSPECIFIED;
be54b15d
DH
1085}
1086#undef FUNC_NAME
1087
3ee86942
MV
1088#endif
1089
1090\f
1091
1092SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1093 (SCM obj),
1094 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1095#define FUNC_NAME s_scm_string_p
1096{
1097 return scm_from_bool (IS_STRING (obj));
1098}
1099#undef FUNC_NAME
1100
1101
1102SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1103
1104SCM_DEFINE (scm_string, "string", 0, 0, 1,
1105 (SCM chrs),
1106 "@deffnx {Scheme Procedure} list->string chrs\n"
1107 "Return a newly allocated string composed of the arguments,\n"
1108 "@var{chrs}.")
1109#define FUNC_NAME s_scm_string
1110{
9aa27c1a 1111 SCM result = SCM_BOOL_F;
9c44cd45 1112 SCM rest;
3ee86942 1113 size_t len;
9c44cd45
MG
1114 size_t p = 0;
1115 long i;
9aa27c1a 1116 int wide = 0;
3ee86942 1117
9c44cd45
MG
1118 /* Verify that this is a list of chars. */
1119 i = scm_ilength (chrs);
3c7cf7f5 1120 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
3ee86942 1121
9c44cd45
MG
1122 len = (size_t) i;
1123 rest = chrs;
3ee86942 1124
9c44cd45 1125 while (len > 0 && scm_is_pair (rest))
3ee86942 1126 {
9c44cd45 1127 SCM elt = SCM_CAR (rest);
3ee86942 1128 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
9aa27c1a
MG
1129 if (SCM_CHAR (elt) > 0xFF)
1130 wide = 1;
9c44cd45
MG
1131 rest = SCM_CDR (rest);
1132 len--;
1133 scm_remember_upto_here_1 (elt);
1134 }
1135
1136 /* Construct a string containing this list of chars. */
1137 len = (size_t) i;
1138 rest = chrs;
1139
9aa27c1a 1140 if (wide == 0)
9c44cd45 1141 {
56a3dcd4
LC
1142 char *buf;
1143
190d4b0d 1144 result = scm_i_make_string (len, NULL, 0);
9aa27c1a 1145 result = scm_i_string_start_writing (result);
56a3dcd4 1146 buf = scm_i_string_writable_chars (result);
9aa27c1a
MG
1147 while (len > 0 && scm_is_pair (rest))
1148 {
1149 SCM elt = SCM_CAR (rest);
1150 buf[p] = (unsigned char) SCM_CHAR (elt);
1151 p++;
1152 rest = SCM_CDR (rest);
1153 len--;
1154 scm_remember_upto_here_1 (elt);
1155 }
1156 }
1157 else
1158 {
56a3dcd4
LC
1159 scm_t_wchar *buf;
1160
190d4b0d 1161 result = scm_i_make_wide_string (len, NULL, 0);
9aa27c1a 1162 result = scm_i_string_start_writing (result);
56a3dcd4 1163 buf = scm_i_string_writable_wide_chars (result);
9aa27c1a
MG
1164 while (len > 0 && scm_is_pair (rest))
1165 {
1166 SCM elt = SCM_CAR (rest);
1167 buf[p] = SCM_CHAR (elt);
1168 p++;
1169 rest = SCM_CDR (rest);
1170 len--;
1171 scm_remember_upto_here_1 (elt);
1172 }
3ee86942 1173 }
9c44cd45
MG
1174 scm_i_string_stop_writing ();
1175
3ee86942
MV
1176 if (len > 0)
1177 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
9c44cd45 1178 if (!scm_is_null (rest))
3ee86942
MV
1179 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1180
1181 return result;
1182}
1183#undef FUNC_NAME
be54b15d 1184
3b3b36dd 1185SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
6fa73e72 1186 (SCM k, SCM chr),
0d26a824
MG
1187 "Return a newly allocated string of\n"
1188 "length @var{k}. If @var{chr} is given, then all elements of\n"
1189 "the string are initialized to @var{chr}, otherwise the contents\n"
b7e64f8b 1190 "of the string are all set to @code{#\nul}.")
1bbd0b84 1191#define FUNC_NAME s_scm_make_string
0f2d19dd 1192{
3ee86942
MV
1193 return scm_c_make_string (scm_to_size_t (k), chr);
1194}
1195#undef FUNC_NAME
1196
1197SCM
1198scm_c_make_string (size_t len, SCM chr)
1199#define FUNC_NAME NULL
1200{
9c44cd45 1201 size_t p;
3ef6650d 1202 char *contents = NULL;
190d4b0d 1203 SCM res = scm_i_make_string (len, &contents, 0);
cb0d8be2 1204
3ef6650d
AW
1205 /* If no char is given, initialize string contents to NULL. */
1206 if (SCM_UNBNDP (chr))
1207 memset (contents, 0, len);
1208 else
e11e83f3 1209 {
3ee86942 1210 SCM_VALIDATE_CHAR (0, chr);
9c44cd45
MG
1211 res = scm_i_string_start_writing (res);
1212 for (p = 0; p < len; p++)
1213 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1214 scm_i_string_stop_writing ();
0f2d19dd 1215 }
e11e83f3
MV
1216
1217 return res;
0f2d19dd 1218}
1bbd0b84 1219#undef FUNC_NAME
0f2d19dd 1220
3b3b36dd 1221SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
0d26a824
MG
1222 (SCM string),
1223 "Return the number of characters in @var{string}.")
1bbd0b84 1224#define FUNC_NAME s_scm_string_length
0f2d19dd 1225{
d1ca2c64 1226 SCM_VALIDATE_STRING (1, string);
3ee86942 1227 return scm_from_size_t (STRING_LENGTH (string));
0f2d19dd 1228}
1bbd0b84 1229#undef FUNC_NAME
0f2d19dd 1230
f8ba2bb9 1231SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
9c44cd45
MG
1232 (SCM string),
1233 "Return the bytes used to represent a character in @var{string}."
1234 "This will return 1 or 4.")
f8ba2bb9 1235#define FUNC_NAME s_scm_string_bytes_per_char
9c44cd45
MG
1236{
1237 SCM_VALIDATE_STRING (1, string);
1238 if (!scm_i_is_narrow_string (string))
1239 return scm_from_int (4);
1240
1241 return scm_from_int (1);
1242}
1243#undef FUNC_NAME
1244
3ee86942
MV
1245size_t
1246scm_c_string_length (SCM string)
1247{
1248 if (!IS_STRING (string))
1249 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1250 return STRING_LENGTH (string);
1251}
1252
bd9e24b3 1253SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
6fa73e72 1254 (SCM str, SCM k),
9c44cd45
MG
1255 "Return character @var{k} of @var{str} using zero-origin\n"
1256 "indexing. @var{k} must be a valid index of @var{str}.")
1bbd0b84 1257#define FUNC_NAME s_scm_string_ref
0f2d19dd 1258{
3ae3166b 1259 size_t len;
a55c2b68 1260 unsigned long idx;
bd9e24b3 1261
d1ca2c64 1262 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1263
1264 len = scm_i_string_length (str);
1265 if (SCM_LIKELY (len > 0))
1266 idx = scm_to_unsigned_integer (k, 0, len - 1);
1267 else
1268 scm_out_of_range (NULL, k);
1269
9c44cd45
MG
1270 if (scm_i_is_narrow_string (str))
1271 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1272 else
1273 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
0f2d19dd 1274}
1bbd0b84 1275#undef FUNC_NAME
0f2d19dd 1276
3ee86942
MV
1277SCM
1278scm_c_string_ref (SCM str, size_t p)
1279{
1280 if (p >= scm_i_string_length (str))
1281 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1282 if (scm_i_is_narrow_string (str))
1283 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1284 else
1285 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1286
3ee86942 1287}
f0942910 1288
3b3b36dd 1289SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
6fa73e72 1290 (SCM str, SCM k, SCM chr),
9c44cd45
MG
1291 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1292 "an unspecified value. @var{k} must be a valid index of\n"
1293 "@var{str}.")
1bbd0b84 1294#define FUNC_NAME s_scm_string_set_x
0f2d19dd 1295{
3ae3166b 1296 size_t len;
a55c2b68
MV
1297 unsigned long idx;
1298
f0942910 1299 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1300
1301 len = scm_i_string_length (str);
1302 if (SCM_LIKELY (len > 0))
1303 idx = scm_to_unsigned_integer (k, 0, len - 1);
1304 else
1305 scm_out_of_range (NULL, k);
1306
34d19ef6 1307 SCM_VALIDATE_CHAR (3, chr);
9c44cd45
MG
1308 str = scm_i_string_start_writing (str);
1309 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1310 scm_i_string_stop_writing ();
1311
0f2d19dd
JB
1312 return SCM_UNSPECIFIED;
1313}
1bbd0b84 1314#undef FUNC_NAME
0f2d19dd 1315
3ee86942
MV
1316void
1317scm_c_string_set_x (SCM str, size_t p, SCM chr)
1318{
1319 if (p >= scm_i_string_length (str))
1320 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1321 str = scm_i_string_start_writing (str);
1322 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1323 scm_i_string_stop_writing ();
3ee86942 1324}
0f2d19dd 1325
3b3b36dd 1326SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
0d26a824
MG
1327 (SCM str, SCM start, SCM end),
1328 "Return a newly allocated string formed from the characters\n"
1329 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1330 "ending with index @var{end} (exclusive).\n"
1331 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1332 "exact integers satisfying:\n\n"
1333 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1bbd0b84 1334#define FUNC_NAME s_scm_substring
0f2d19dd 1335{
3ee86942 1336 size_t len, from, to;
685c0d71 1337
d1ca2c64 1338 SCM_VALIDATE_STRING (1, str);
3ee86942
MV
1339 len = scm_i_string_length (str);
1340 from = scm_to_unsigned_integer (start, 0, len);
a55c2b68 1341 if (SCM_UNBNDP (end))
3ee86942 1342 to = len;
a55c2b68 1343 else
3ee86942
MV
1344 to = scm_to_unsigned_integer (end, from, len);
1345 return scm_i_substring (str, from, to);
0f2d19dd 1346}
1bbd0b84 1347#undef FUNC_NAME
0f2d19dd 1348
ed35de72
MV
1349SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1350 (SCM str, SCM start, SCM end),
1351 "Return a newly allocated string formed from the characters\n"
1352 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1353 "ending with index @var{end} (exclusive).\n"
1354 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1355 "exact integers satisfying:\n"
1356 "\n"
1357 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1358 "\n"
1359 "The returned string is read-only.\n")
1360#define FUNC_NAME s_scm_substring_read_only
1361{
1362 size_t len, from, to;
1363
1364 SCM_VALIDATE_STRING (1, str);
1365 len = scm_i_string_length (str);
1366 from = scm_to_unsigned_integer (start, 0, len);
1367 if (SCM_UNBNDP (end))
1368 to = len;
1369 else
1370 to = scm_to_unsigned_integer (end, from, len);
1371 return scm_i_substring_read_only (str, from, to);
1372}
1373#undef FUNC_NAME
1374
3ee86942
MV
1375SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1376 (SCM str, SCM start, SCM end),
1377 "Return a newly allocated string formed from the characters\n"
1378 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1379 "ending with index @var{end} (exclusive).\n"
1380 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1381 "exact integers satisfying:\n\n"
1382 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1383#define FUNC_NAME s_scm_substring_copy
1384{
e1b29f6a
MV
1385 /* For the Scheme version, START is mandatory, but for the C
1386 version, it is optional. See scm_string_copy in srfi-13.c for a
1387 rationale.
1388 */
1389
1390 size_t from, to;
3ee86942
MV
1391
1392 SCM_VALIDATE_STRING (1, str);
e1b29f6a
MV
1393 scm_i_get_substring_spec (scm_i_string_length (str),
1394 start, &from, end, &to);
3ee86942
MV
1395 return scm_i_substring_copy (str, from, to);
1396}
1397#undef FUNC_NAME
1398
1399SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1400 (SCM str, SCM start, SCM end),
1401 "Return string that indirectly refers to the characters\n"
1402 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1403 "ending with index @var{end} (exclusive).\n"
1404 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1405 "exact integers satisfying:\n\n"
1406 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1407#define FUNC_NAME s_scm_substring_shared
1408{
1409 size_t len, from, to;
1410
1411 SCM_VALIDATE_STRING (1, str);
1412 len = scm_i_string_length (str);
1413 from = scm_to_unsigned_integer (start, 0, len);
1414 if (SCM_UNBNDP (end))
1415 to = len;
1416 else
1417 to = scm_to_unsigned_integer (end, from, len);
1418 return scm_i_substring_shared (str, from, to);
1419}
1420#undef FUNC_NAME
685c0d71 1421
3b3b36dd 1422SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
6fa73e72 1423 (SCM args),
9c44cd45 1424 "Return a newly allocated string whose characters form the\n"
0d26a824 1425 "concatenation of the given strings, @var{args}.")
1bbd0b84 1426#define FUNC_NAME s_scm_string_append
0f2d19dd
JB
1427{
1428 SCM res;
2f13a466
MW
1429 size_t total = 0;
1430 size_t len;
9c44cd45 1431 int wide = 0;
c829a427 1432 SCM l, s;
bd4911ef 1433 size_t i;
9909c395
MG
1434 union
1435 {
1436 char *narrow;
1437 scm_t_wchar *wide;
1438 } data;
af45e3b0
DH
1439
1440 SCM_VALIDATE_REST_ARGUMENT (args);
9c44cd45 1441 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427
MV
1442 {
1443 s = SCM_CAR (l);
1444 SCM_VALIDATE_STRING (SCM_ARGn, s);
2f13a466
MW
1445 len = scm_i_string_length (s);
1446 if (((size_t) -1) - total < len)
1447 scm_num_overflow (s_scm_string_append);
1448 total += len;
9c44cd45
MG
1449 if (!scm_i_is_narrow_string (s))
1450 wide = 1;
c829a427 1451 }
9909c395 1452 data.narrow = NULL;
9c44cd45 1453 if (!wide)
2f13a466 1454 res = scm_i_make_string (total, &data.narrow, 0);
9c44cd45 1455 else
2f13a466 1456 res = scm_i_make_wide_string (total, &data.wide, 0);
9c44cd45
MG
1457
1458 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427 1459 {
edea856c 1460 size_t len;
c829a427 1461 s = SCM_CAR (l);
3ee86942 1462 SCM_VALIDATE_STRING (SCM_ARGn, s);
edea856c 1463 len = scm_i_string_length (s);
2f13a466
MW
1464 if (len > total)
1465 SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
9c44cd45
MG
1466 if (!wide)
1467 {
9909c395
MG
1468 memcpy (data.narrow, scm_i_string_chars (s), len);
1469 data.narrow += len;
9c44cd45
MG
1470 }
1471 else
1472 {
1473 if (scm_i_is_narrow_string (s))
1474 {
2f13a466
MW
1475 const char *src = scm_i_string_chars (s);
1476 for (i = 0; i < len; i++)
1477 data.wide[i] = (unsigned char) src[i];
9c44cd45
MG
1478 }
1479 else
9909c395 1480 u32_cpy ((scm_t_uint32 *) data.wide,
9c44cd45 1481 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
9909c395 1482 data.wide += len;
9c44cd45 1483 }
2f13a466 1484 total -= len;
c829a427
MV
1485 scm_remember_upto_here_1 (s);
1486 }
2f13a466
MW
1487 if (total != 0)
1488 SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
0f2d19dd
JB
1489 return res;
1490}
1bbd0b84 1491#undef FUNC_NAME
0f2d19dd 1492
24933780 1493
a3d7d5d5 1494\f
c62da8f8 1495/* Charset conversion error handling. */
a3d7d5d5
LC
1496
1497SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
c62da8f8
LC
1498SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1499
6851d3be
LC
1500/* Raise an exception informing that character CHR could not be written
1501 to PORT in its current encoding. */
d14418a5 1502void
ef7e4ba3 1503scm_encoding_error (const char *subr, int err, const char *message,
6851d3be 1504 SCM port, SCM chr)
ef7e4ba3 1505{
ef7e4ba3 1506 scm_throw (scm_encoding_error_key,
d050ef66
AW
1507 scm_list_n (scm_from_latin1_string (subr),
1508 scm_from_latin1_string (message),
ef7e4ba3 1509 scm_from_int (err),
6851d3be 1510 port, chr,
ef7e4ba3 1511 SCM_UNDEFINED));
a3d7d5d5
LC
1512}
1513
c62da8f8
LC
1514/* Raise an exception informing of an encoding error on PORT. This
1515 means that a character could not be written in PORT's encoding. */
1516void
1517scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1518{
1519 scm_throw (scm_decoding_error_key,
d050ef66
AW
1520 scm_list_n (scm_from_latin1_string (subr),
1521 scm_from_latin1_string (message),
c62da8f8
LC
1522 scm_from_int (err),
1523 port,
1524 SCM_UNDEFINED));
1525}
1526
1527\f
1528/* String conversion to/from C. */
1529
41d1d984
AW
1530static void
1531decoding_error (const char *func_name, int errno_save,
1532 const char *str, size_t len)
1533{
1534 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1535 handler. */
1536 SCM bv;
1537 signed char *buf;
1538
1539 buf = scm_gc_malloc_pointerless (len, "bytevector");
1540 memcpy (buf, str, len);
1541 bv = scm_c_take_gc_bytevector (buf, len, SCM_BOOL_F);
1542
1543 scm_decoding_error (func_name, errno_save,
1544 "input locale conversion error", bv);
1545}
1546
fac32b51 1547SCM
587a3355
MG
1548scm_from_stringn (const char *str, size_t len, const char *encoding,
1549 scm_t_string_failed_conversion_handler handler)
1550{
1551 size_t u32len, i;
1552 scm_t_wchar *u32;
1553 int wide = 0;
1554 SCM res;
1555
d40e1ca8 1556 /* The order of these checks is important. */
a574564c 1557 if (!str && len != 0)
d40e1ca8
AW
1558 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1559 if (len == (size_t) -1)
1560 len = strlen (str);
fac32b51 1561
f6f4feb0 1562 if (c_strcasecmp (encoding, "ISO-8859-1") == 0 || len == 0)
098818a1 1563 return scm_from_latin1_stringn (str, len);
f6f4feb0 1564 else if (c_strcasecmp (encoding, "UTF-8") == 0
8c76a897 1565 && handler == SCM_FAILED_CONVERSION_ERROR)
098818a1 1566 return scm_from_utf8_stringn (str, len);
889975e5 1567
587a3355
MG
1568 u32len = 0;
1569 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1570 (enum iconv_ilseq_handler)
1571 handler,
1572 str, len,
1573 NULL,
1574 NULL, &u32len);
1575
ef7e4ba3 1576 if (SCM_UNLIKELY (u32 == NULL))
41d1d984 1577 decoding_error (__func__, errno, str, len);
587a3355
MG
1578
1579 i = 0;
1580 while (i < u32len)
1581 if (u32[i++] > 0xFF)
1582 {
1583 wide = 1;
1584 break;
1585 }
1586
1587 if (!wide)
1588 {
1589 char *dst;
190d4b0d 1590 res = scm_i_make_string (u32len, &dst, 0);
587a3355
MG
1591 for (i = 0; i < u32len; i ++)
1592 dst[i] = (unsigned char) u32[i];
1593 dst[u32len] = '\0';
1594 }
1595 else
1596 {
1597 scm_t_wchar *wdst;
190d4b0d 1598 res = scm_i_make_wide_string (u32len, &wdst, 0);
587a3355
MG
1599 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1600 wdst[u32len] = 0;
1601 }
1602
1603 free (u32);
1604 return res;
1605}
1606
cf313a94 1607SCM
d40e1ca8 1608scm_from_locale_string (const char *str)
cf313a94 1609{
d40e1ca8 1610 return scm_from_locale_stringn (str, -1);
cf313a94
MG
1611}
1612
c829a427
MV
1613SCM
1614scm_from_locale_stringn (const char *str, size_t len)
1615{
95f5e303 1616 return scm_from_stringn (str, len, locale_charset (),
b22e94db 1617 scm_i_default_port_conversion_handler ());
c829a427 1618}
4d4528e7 1619
c829a427 1620SCM
d40e1ca8 1621scm_from_latin1_string (const char *str)
4d4528e7 1622{
d40e1ca8
AW
1623 return scm_from_latin1_stringn (str, -1);
1624}
9c44cd45 1625
d40e1ca8
AW
1626SCM
1627scm_from_latin1_stringn (const char *str, size_t len)
1628{
e9a35a96
LC
1629 char *buf;
1630 SCM result;
1631
1632 if (len == (size_t) -1)
1633 len = strlen (str);
1634
1635 /* Make a narrow string and copy STR as is. */
190d4b0d 1636 result = scm_i_make_string (len, &buf, 0);
e9a35a96
LC
1637 memcpy (buf, str, len);
1638
1639 return result;
c829a427 1640}
4d4528e7 1641
587a3355 1642SCM
d40e1ca8 1643scm_from_utf8_string (const char *str)
587a3355 1644{
d40e1ca8
AW
1645 return scm_from_utf8_stringn (str, -1);
1646}
1647
1648SCM
1649scm_from_utf8_stringn (const char *str, size_t len)
1650{
41d1d984
AW
1651 size_t i, char_len;
1652 const scm_t_uint8 *ustr = (const scm_t_uint8 *) str;
1653 int ascii = 1, narrow = 1;
1654 SCM res;
1655
1656 if (len == (size_t) -1)
1657 len = strlen (str);
1658
1659 i = 0;
1660 char_len = 0;
1661
1662 while (i < len)
1663 {
1664 if (ustr[i] <= 127)
1665 {
1666 char_len++;
1667 i++;
1668 }
1669 else
1670 {
1671 ucs4_t c;
1672 int nbytes;
1673
1674 ascii = 0;
1675
1676 nbytes = u8_mbtouc (&c, ustr + i, len - i);
1677
8c76a897 1678 if (c == 0xfffd)
41d1d984
AW
1679 /* Bad UTF-8. */
1680 decoding_error (__func__, errno, str, len);
1681
1682 if (c > 255)
1683 narrow = 0;
1684
1685 char_len++;
1686 i += nbytes;
1687 }
1688 }
1689
1690 if (ascii)
1691 {
1692 char *dst;
1693 res = scm_i_make_string (char_len, &dst, 0);
1694 memcpy (dst, str, len);
1695 }
1696 else if (narrow)
1697 {
1698 char *dst;
1699 size_t j;
1700 ucs4_t c;
1701
1702 res = scm_i_make_string (char_len, &dst, 0);
1703
33aadcab 1704 for (i = 0, j = 0; i < len; j++)
41d1d984
AW
1705 {
1706 i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1707 dst[j] = (signed char) c;
1708 }
1709 }
1710 else
1711 {
1712 scm_t_wchar *dst;
1713 size_t j;
1714 ucs4_t c;
1715
1716 res = scm_i_make_wide_string (char_len, &dst, 0);
1717
33aadcab 1718 for (i = 0, j = 0; i < len; j++)
41d1d984
AW
1719 {
1720 i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1721 dst[j] = c;
1722 }
1723 }
1724
1725 return res;
587a3355
MG
1726}
1727
647dc1ac
LC
1728SCM
1729scm_from_utf32_string (const scm_t_wchar *str)
1730{
1731 return scm_from_utf32_stringn (str, -1);
1732}
1733
1734SCM
1735scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1736{
1737 SCM result;
1738 scm_t_wchar *buf;
1739
1740 if (len == (size_t) -1)
1741 len = u32_strlen ((uint32_t *) str);
1742
190d4b0d 1743 result = scm_i_make_wide_string (len, &buf, 0);
647dc1ac
LC
1744 memcpy (buf, str, len * sizeof (scm_t_wchar));
1745 scm_i_try_narrow_string (result);
1746
1747 return result;
1748}
1749
08467a7e
AW
1750SCM
1751scm_from_port_string (const char *str, SCM port)
1752{
1753 return scm_from_port_stringn (str, -1, port);
1754}
1755
1756SCM
1757scm_from_port_stringn (const char *str, size_t len, SCM port)
1758{
1759 scm_t_port *pt = SCM_PTAB_ENTRY (port);
f6f4feb0 1760 scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
08467a7e 1761
f6f4feb0 1762 if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1)
08467a7e 1763 return scm_from_latin1_stringn (str, len);
f6f4feb0 1764 else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8
25752c4d
AW
1765 && (pt->ilseq_handler == SCM_FAILED_CONVERSION_ERROR
1766 || (u8_check ((uint8_t *) str, len) == NULL)))
08467a7e
AW
1767 return scm_from_utf8_stringn (str, len);
1768 else
1769 return scm_from_stringn (str, len, pt->encoding, pt->ilseq_handler);
1770}
1771
50b1996f
MG
1772/* Create a new scheme string from the C string STR. The memory of
1773 STR may be used directly as storage for the new string. */
13a94556
LC
1774/* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1775 would be to register a finalizer to eventually free(3) STR, which isn't
1776 worth it. Should we just deprecate the `scm_take_' functions? */
c829a427
MV
1777SCM
1778scm_take_locale_stringn (char *str, size_t len)
1779{
13a94556 1780 SCM res;
48ddf0d9 1781
13a94556
LC
1782 res = scm_from_locale_stringn (str, len);
1783 free (str);
c829a427 1784
c829a427
MV
1785 return res;
1786}
1787
48ddf0d9
KR
1788SCM
1789scm_take_locale_string (char *str)
1790{
1791 return scm_take_locale_stringn (str, -1);
1792}
1793
f1ee6d54
LC
1794/* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1795 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
31d4d02b
LC
1796 Set *LENP to the size of the resulting string.
1797
1798 FIXME: This is a hack we should get rid of. See
1799 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1800 for details. */
1801static void
1802unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
9c44cd45
MG
1803{
1804 char *before, *after;
1805 size_t i, j;
1806
4ff2b9f4
LC
1807 before = buf;
1808 after = buf;
9c44cd45
MG
1809 i = 0;
1810 j = 0;
1811 while (i < *lenp)
1812 {
1813 if ((i <= *lenp - 6)
1814 && before[i] == '\\'
1815 && before[i + 1] == 'u'
1816 && before[i + 2] == '0' && before[i + 3] == '0')
1817 {
1818 /* Convert \u00NN to \xNN */
1819 after[j] = '\\';
1820 after[j + 1] = 'x';
30a6b9ca
MG
1821 after[j + 2] = tolower ((int) before[i + 4]);
1822 after[j + 3] = tolower ((int) before[i + 5]);
9c44cd45
MG
1823 i += 6;
1824 j += 4;
1825 }
1826 else if ((i <= *lenp - 10)
1827 && before[i] == '\\'
1828 && before[i + 1] == 'U'
1829 && before[i + 2] == '0' && before[i + 3] == '0')
1830 {
1831 /* Convert \U00NNNNNN to \UNNNNNN */
1832 after[j] = '\\';
1833 after[j + 1] = 'U';
30a6b9ca
MG
1834 after[j + 2] = tolower ((int) before[i + 4]);
1835 after[j + 3] = tolower ((int) before[i + 5]);
1836 after[j + 4] = tolower ((int) before[i + 6]);
1837 after[j + 5] = tolower ((int) before[i + 7]);
1838 after[j + 6] = tolower ((int) before[i + 8]);
1839 after[j + 7] = tolower ((int) before[i + 9]);
9c44cd45
MG
1840 i += 10;
1841 j += 8;
1842 }
1843 else
1844 {
1845 after[j] = before[i];
1846 i++;
1847 j++;
1848 }
1849 }
1850 *lenp = j;
9c44cd45
MG
1851}
1852
f1ee6d54
LC
1853/* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1854 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1855 of the resulting string. BUF must be large enough to handle the
1856 worst case when `\uXXXX' escapes (6 characters) are replaced by
1857 `\xXXXX;' (7 characters). */
31d4d02b
LC
1858static void
1859unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
d31b9519
MG
1860{
1861 char *before, *after;
1862 size_t i, j;
1863 /* The worst case is if the input string contains all 4-digit hex escapes.
1864 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1865 size_t max_out_len = (*lenp * 7) / 6 + 1;
1866 size_t nzeros, ndigits;
1867
4ff2b9f4 1868 before = buf;
d31b9519
MG
1869 after = alloca (max_out_len);
1870 i = 0;
1871 j = 0;
1872 while (i < *lenp)
1873 {
1874 if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1875 || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1876 {
1877 if (before[i + 1] == 'u')
1878 ndigits = 4;
1879 else if (before[i + 1] == 'U')
1880 ndigits = 8;
1881 else
1882 abort ();
1883
1884 /* Add the R6RS hex escape initial sequence. */
1885 after[j] = '\\';
1886 after[j + 1] = 'x';
1887
1888 /* Move string positions to the start of the hex numbers. */
1889 i += 2;
1890 j += 2;
1891
1892 /* Find the number of initial zeros in this hex number. */
1893 nzeros = 0;
1894 while (before[i + nzeros] == '0' && nzeros < ndigits)
1895 nzeros++;
1896
1897 /* Copy the number, skipping initial zeros, and then move the string
1898 positions. */
1899 if (nzeros == ndigits)
1900 {
1901 after[j] = '0';
1902 i += ndigits;
1903 j += 1;
1904 }
1905 else
1906 {
1907 int pos;
1908 for (pos = 0; pos < ndigits - nzeros; pos++)
1909 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1910 i += ndigits;
1911 j += (ndigits - nzeros);
1912 }
1913
1914 /* Add terminating semicolon. */
1915 after[j] = ';';
1916 j++;
1917 }
1918 else
1919 {
1920 after[j] = before[i];
1921 i++;
1922 j++;
1923 }
1924 }
1925 *lenp = j;
d31b9519
MG
1926 memcpy (before, after, j);
1927}
1928
cf313a94 1929char *
d40e1ca8 1930scm_to_locale_string (SCM str)
cf313a94 1931{
d40e1ca8 1932 return scm_to_locale_stringn (str, NULL);
cf313a94 1933}
d31b9519 1934
c829a427 1935char *
fac32b51 1936scm_to_locale_stringn (SCM str, size_t *lenp)
c829a427 1937{
b22e94db 1938 return scm_to_stringn (str, lenp,
95f5e303 1939 locale_charset (),
b22e94db 1940 scm_i_default_port_conversion_handler ());
9c44cd45
MG
1941}
1942
d40e1ca8
AW
1943char *
1944scm_to_latin1_string (SCM str)
1945{
1946 return scm_to_latin1_stringn (str, NULL);
1947}
1948
1949char *
1950scm_to_latin1_stringn (SCM str, size_t *lenp)
e9a35a96 1951#define FUNC_NAME "scm_to_latin1_stringn"
d40e1ca8 1952{
e9a35a96
LC
1953 char *result;
1954
1955 SCM_VALIDATE_STRING (1, str);
1956
1957 if (scm_i_is_narrow_string (str))
1958 {
fe133640
AW
1959 size_t len = scm_i_string_length (str);
1960
e9a35a96 1961 if (lenp)
fe133640 1962 *lenp = len;
e9a35a96 1963
fe133640 1964 result = scm_strndup (scm_i_string_data (str), len);
e9a35a96
LC
1965 }
1966 else
1967 result = scm_to_stringn (str, lenp, NULL,
fe133640 1968 SCM_FAILED_CONVERSION_ERROR);
e9a35a96
LC
1969
1970 return result;
d40e1ca8 1971}
e9a35a96 1972#undef FUNC_NAME
d40e1ca8
AW
1973
1974char *
1975scm_to_utf8_string (SCM str)
1976{
1977 return scm_to_utf8_stringn (str, NULL);
1978}
1979
e3d45974
AW
1980static size_t
1981latin1_u8_strlen (const scm_t_uint8 *str, size_t len)
1982{
1983 size_t ret, i;
1984 for (i = 0, ret = 0; i < len; i++)
1985 ret += (str[i] < 128) ? 1 : 2;
1986 return ret;
1987}
1988
1989static scm_t_uint8*
1990latin1_to_u8 (const scm_t_uint8 *str, size_t latin_len,
1991 scm_t_uint8 *u8_result, size_t *u8_lenp)
1992{
1993 size_t i, n;
1994 size_t u8_len = latin1_u8_strlen (str, latin_len);
1995
1996 if (!(u8_result && u8_lenp && *u8_lenp > u8_len))
1997 u8_result = scm_malloc (u8_len + 1);
1998 if (u8_lenp)
1999 *u8_lenp = u8_len;
2000
2001 for (i = 0, n = 0; i < latin_len; i++)
2002 n += u8_uctomb (u8_result + n, str[i], u8_len - n);
2003 if (n != u8_len)
2004 abort ();
2005 u8_result[n] = 0;
2006
2007 return u8_result;
2008}
2009
e26da7a2
MW
2010/* UTF-8 code table
2011
2012 (Note that this includes code points that are not allowed by Unicode,
2013 but since this function has no way to report an error, and its
2014 purpose is to determine the size of destination buffers for
2015 libunicode conversion functions, we err on the safe side and handle
2016 everything that libunicode might conceivably handle, now or in the
2017 future.)
2c1b7951
AW
2018
2019 Char. number range | UTF-8 octet sequence
2020 (hexadecimal) | (binary)
e26da7a2 2021 --------------------+------------------------------------------------------
2c1b7951
AW
2022 0000 0000-0000 007F | 0xxxxxxx
2023 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
2024 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
e26da7a2
MW
2025 0001 0000-001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
2026 0020 0000-03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2027 0400 0000-7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
2c1b7951
AW
2028*/
2029
2030static size_t
e26da7a2 2031u32_u8_length_in_bytes (const scm_t_uint32 *str, size_t len)
2c1b7951
AW
2032{
2033 size_t ret, i;
2034
2035 for (i = 0, ret = 0; i < len; i++)
2036 {
2037 scm_t_uint32 c = str[i];
2038
2039 if (c <= 0x7f)
2040 ret += 1;
2041 else if (c <= 0x7ff)
2042 ret += 2;
2043 else if (c <= 0xffff)
2044 ret += 3;
e26da7a2 2045 else if (c <= 0x1fffff)
2c1b7951 2046 ret += 4;
e26da7a2
MW
2047 else if (c <= 0x3ffffff)
2048 ret += 5;
2049 else
2050 ret += 6;
2c1b7951
AW
2051 }
2052
2053 return ret;
2054}
2055
d40e1ca8
AW
2056char *
2057scm_to_utf8_stringn (SCM str, size_t *lenp)
ee26a9eb 2058#define FUNC_NAME "scm_to_utf8_stringn"
d40e1ca8 2059{
ee26a9eb
AW
2060 SCM_VALIDATE_STRING (1, str);
2061
e3d45974
AW
2062 if (scm_i_is_narrow_string (str))
2063 return (char *) latin1_to_u8 ((scm_t_uint8 *) scm_i_string_chars (str),
2064 scm_i_string_length (str),
2065 NULL, lenp);
2066 else
2c1b7951 2067 {
bbb9f000 2068 scm_t_uint32 *chars = (scm_t_uint32 *) scm_i_string_wide_chars (str);
2c1b7951 2069 scm_t_uint8 *buf, *ret;
bbb9f000
MW
2070 size_t num_chars = scm_i_string_length (str);
2071 size_t num_bytes_predicted, num_bytes_actual;
2c1b7951 2072
bbb9f000 2073 num_bytes_predicted = u32_u8_length_in_bytes (chars, num_chars);
2c1b7951 2074
e26da7a2 2075 if (lenp)
2c1b7951 2076 {
bbb9f000
MW
2077 *lenp = num_bytes_predicted;
2078 buf = scm_malloc (num_bytes_predicted);
2c1b7951 2079 }
e26da7a2
MW
2080 else
2081 {
bbb9f000
MW
2082 buf = scm_malloc (num_bytes_predicted + 1);
2083 buf[num_bytes_predicted] = 0;
e26da7a2
MW
2084 }
2085
bbb9f000
MW
2086 num_bytes_actual = num_bytes_predicted;
2087 ret = u32_to_u8 (chars, num_chars, buf, &num_bytes_actual);
e26da7a2 2088
bbb9f000 2089 if (SCM_LIKELY (ret == buf && num_bytes_actual == num_bytes_predicted))
e26da7a2 2090 return (char *) ret;
2c1b7951
AW
2091
2092 /* An error: a bad codepoint. */
2093 {
2094 int saved_errno = errno;
2095
2096 free (buf);
2097 if (!saved_errno)
2098 abort ();
2099
2100 scm_decoding_error ("scm_to_utf8_stringn", errno,
2101 "invalid codepoint in string", str);
2102
2103 /* Not reached. */
2104 return NULL;
2105 }
2106 }
d40e1ca8 2107}
ee26a9eb 2108#undef FUNC_NAME
d40e1ca8 2109
647dc1ac
LC
2110scm_t_wchar *
2111scm_to_utf32_string (SCM str)
2112{
2113 return scm_to_utf32_stringn (str, NULL);
2114}
2115
2116scm_t_wchar *
2117scm_to_utf32_stringn (SCM str, size_t *lenp)
2118#define FUNC_NAME "scm_to_utf32_stringn"
2119{
2120 scm_t_wchar *result;
2121
2122 SCM_VALIDATE_STRING (1, str);
2123
2124 if (scm_i_is_narrow_string (str))
e3d45974
AW
2125 {
2126 scm_t_uint8 *codepoints;
2127 size_t i, len;
2128
2129 codepoints = (scm_t_uint8*) scm_i_string_chars (str);
2130 len = scm_i_string_length (str);
2131 if (lenp)
2132 *lenp = len;
2133
2134 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2135 for (i = 0; i < len; i++)
2136 result[i] = codepoints[i];
2137 result[len] = 0;
2138 }
647dc1ac
LC
2139 else
2140 {
2141 size_t len;
2142
2143 len = scm_i_string_length (str);
2144 if (lenp)
2145 *lenp = len;
2146
2147 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
2148 memcpy (result, scm_i_string_wide_chars (str),
2149 len * sizeof (scm_t_wchar));
2150 result[len] = 0;
2151 }
2152
2153 return result;
2154}
2155#undef FUNC_NAME
2156
08467a7e
AW
2157char *
2158scm_to_port_string (SCM str, SCM port)
2159{
2160 return scm_to_port_stringn (str, NULL, port);
2161}
2162
2163char *
2164scm_to_port_stringn (SCM str, size_t *lenp, SCM port)
2165{
2166 scm_t_port *pt = SCM_PTAB_ENTRY (port);
f6f4feb0 2167 scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
08467a7e 2168
f6f4feb0 2169 if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1
08467a7e
AW
2170 && pt->ilseq_handler == SCM_FAILED_CONVERSION_ERROR)
2171 return scm_to_latin1_stringn (str, lenp);
f6f4feb0 2172 else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8)
08467a7e
AW
2173 return scm_to_utf8_stringn (str, lenp);
2174 else
2175 return scm_to_stringn (str, lenp, pt->encoding, pt->ilseq_handler);
2176}
2177
29bcdbb0
LC
2178/* Return a malloc(3)-allocated buffer containing the contents of STR encoded
2179 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
2180 the returned buffer. If the conversion to ENCODING fails, apply the strategy
2181 defined by HANDLER. */
9c44cd45 2182char *
587a3355 2183scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
eca29b02 2184 scm_t_string_failed_conversion_handler handler)
9c44cd45 2185{
9c44cd45
MG
2186 char *buf;
2187 size_t ilen, len, i;
889975e5
MG
2188 int ret;
2189 const char *enc;
4d4528e7 2190
3ee86942 2191 if (!scm_is_string (str))
c829a427 2192 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
2193 ilen = scm_i_string_length (str);
2194
2195 if (ilen == 0)
2196 {
2197 buf = scm_malloc (1);
2198 buf[0] = '\0';
2199 if (lenp)
2200 *lenp = 0;
2201 return buf;
2202 }
587a3355 2203
c829a427 2204 if (lenp == NULL)
9c44cd45
MG
2205 for (i = 0; i < ilen; i++)
2206 if (scm_i_string_ref (str, i) == '\0')
2207 scm_misc_error (NULL,
2208 "string contains #\\nul character: ~S",
2209 scm_list_1 (str));
2210
f6f4feb0
MW
2211 if (scm_i_is_narrow_string (str)
2212 && c_strcasecmp (encoding, "ISO-8859-1") == 0)
c829a427 2213 {
889975e5
MG
2214 /* If using native Latin-1 encoding, just copy the string
2215 contents. */
9c44cd45
MG
2216 if (lenp)
2217 {
2218 buf = scm_malloc (ilen);
2219 memcpy (buf, scm_i_string_chars (str), ilen);
2220 *lenp = ilen;
2221 return buf;
2222 }
2223 else
2224 {
2225 buf = scm_malloc (ilen + 1);
2226 memcpy (buf, scm_i_string_chars (str), ilen);
2227 buf[ilen] = '\0';
2228 return buf;
2229 }
c829a427 2230 }
9c44cd45 2231
587a3355 2232
9c44cd45
MG
2233 buf = NULL;
2234 len = 0;
889975e5
MG
2235 enc = encoding;
2236 if (enc == NULL)
2237 enc = "ISO-8859-1";
2238 if (scm_i_is_narrow_string (str))
2239 {
2240 ret = mem_iconveh (scm_i_string_chars (str), ilen,
2241 "ISO-8859-1", enc,
2242 (enum iconv_ilseq_handler) handler, NULL,
2243 &buf, &len);
9c44cd45 2244
889975e5 2245 if (ret != 0)
ef7e4ba3 2246 scm_encoding_error (__func__, errno,
6851d3be
LC
2247 "cannot convert narrow string to output locale",
2248 SCM_BOOL_F,
2249 /* FIXME: Faulty character unknown. */
2250 SCM_BOOL_F);
889975e5
MG
2251 }
2252 else
2253 {
d31b9519 2254 buf = u32_conv_to_encoding (enc,
889975e5 2255 (enum iconv_ilseq_handler) handler,
d31b9519 2256 (scm_t_uint32 *) scm_i_string_wide_chars (str),
889975e5
MG
2257 ilen,
2258 NULL,
2259 NULL, &len);
2260 if (buf == NULL)
ef7e4ba3 2261 scm_encoding_error (__func__, errno,
6851d3be
LC
2262 "cannot convert wide string to output locale",
2263 SCM_BOOL_F,
2264 /* FIXME: Faulty character unknown. */
2265 SCM_BOOL_F);
d31b9519
MG
2266 }
2267 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
2268 {
2269 if (SCM_R6RS_ESCAPES_P)
f1ee6d54
LC
2270 {
2271 /* The worst case is if the input string contains all 4-digit
2272 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
2273 (seven characters). Make BUF large enough to hold
2274 that. */
2275 buf = scm_realloc (buf, (len * 7) / 6 + 1);
31d4d02b 2276 unistring_escapes_to_r6rs_escapes (buf, &len);
f1ee6d54 2277 }
d31b9519 2278 else
31d4d02b 2279 unistring_escapes_to_guile_escapes (buf, &len);
4ff2b9f4
LC
2280
2281 buf = scm_realloc (buf, len);
889975e5 2282 }
9c44cd45 2283 if (lenp)
4d4528e7 2284 *lenp = len;
9c44cd45
MG
2285 else
2286 {
2287 buf = scm_realloc (buf, len + 1);
2288 buf[len] = '\0';
2289 }
24933780 2290
c829a427 2291 scm_remember_upto_here_1 (str);
9c44cd45 2292 return buf;
4d4528e7 2293}
af68e5e5 2294
c829a427
MV
2295size_t
2296scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
2297{
2298 size_t len;
9c44cd45 2299 char *result = NULL;
3ee86942 2300 if (!scm_is_string (str))
c829a427 2301 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
2302 result = scm_to_locale_stringn (str, &len);
2303
2304 memcpy (buf, result, (len > max_len) ? max_len : len);
2305 free (result);
2306
c829a427
MV
2307 scm_remember_upto_here_1 (str);
2308 return len;
2309}
af68e5e5 2310
a3d7d5d5
LC
2311\f
2312/* Unicode string normalization. */
2313
edb7bb47
JG
2314/* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2315 libguile/i18n.c. It would be useful to have this factored out into a more
2316 convenient location, but its use of alloca makes that tricky to do. */
2317
2318static SCM
2319normalize_str (SCM string, uninorm_t form)
2320{
2321 SCM ret;
2322 scm_t_uint32 *w_str;
2323 scm_t_wchar *cbuf;
2324 size_t rlen, len = scm_i_string_length (string);
2325
2326 if (scm_i_is_narrow_string (string))
2327 {
2328 size_t i;
2329 const char *buf = scm_i_string_chars (string);
2330
2331 w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2332
2333 for (i = 0; i < len; i ++)
2334 w_str[i] = (unsigned char) buf[i];
2335 w_str[len] = 0;
2336 }
d8164b04
JG
2337 else
2338 w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2339
edb7bb47
JG
2340 w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2341
190d4b0d 2342 ret = scm_i_make_wide_string (rlen, &cbuf, 0);
edb7bb47
JG
2343 u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2344 free (w_str);
d8164b04
JG
2345
2346 scm_i_try_narrow_string (ret);
2347
edb7bb47
JG
2348 return ret;
2349}
2350
2351SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2352 (SCM string),
2353 "Returns the NFC normalized form of @var{string}.")
2354#define FUNC_NAME s_scm_string_normalize_nfc
2355{
2356 SCM_VALIDATE_STRING (1, string);
2357 return normalize_str (string, UNINORM_NFC);
2358}
2359#undef FUNC_NAME
2360
2361SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2362 (SCM string),
2363 "Returns the NFD normalized form of @var{string}.")
2364#define FUNC_NAME s_scm_string_normalize_nfd
2365{
2366 SCM_VALIDATE_STRING (1, string);
2367 return normalize_str (string, UNINORM_NFD);
2368}
2369#undef FUNC_NAME
2370
2371SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2372 (SCM string),
2373 "Returns the NFKC normalized form of @var{string}.")
2374#define FUNC_NAME s_scm_string_normalize_nfkc
2375{
2376 SCM_VALIDATE_STRING (1, string);
2377 return normalize_str (string, UNINORM_NFKC);
2378}
2379#undef FUNC_NAME
2380
2381SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2382 (SCM string),
2383 "Returns the NFKD normalized form of @var{string}.")
2384#define FUNC_NAME s_scm_string_normalize_nfkd
2385{
2386 SCM_VALIDATE_STRING (1, string);
2387 return normalize_str (string, UNINORM_NFKD);
2388}
2389#undef FUNC_NAME
2390
7505c6e0
MW
2391/* converts C scm_array of strings to SCM scm_list of strings.
2392 If argc < 0, a null terminated scm_array is assumed.
2393 The current locale encoding is assumed */
9c44cd45 2394SCM
3ee86942
MV
2395scm_makfromstrs (int argc, char **argv)
2396{
2397 int i = argc;
2398 SCM lst = SCM_EOL;
2399 if (0 > i)
2400 for (i = 0; argv[i]; i++);
2401 while (i--)
2402 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2403 return lst;
2404}
2405
c829a427 2406/* Return a newly allocated array of char pointers to each of the strings
7505c6e0
MW
2407 in args, with a terminating NULL pointer. The strings are encoded using
2408 the current locale. */
c829a427
MV
2409
2410char **
2411scm_i_allocate_string_pointers (SCM list)
2a776823 2412#define FUNC_NAME "scm_i_allocate_string_pointers"
af68e5e5 2413{
c829a427 2414 char **result;
7505c6e0 2415 int list_len = scm_ilength (list);
c829a427
MV
2416 int i;
2417
7505c6e0 2418 if (list_len < 0)
c829a427
MV
2419 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2420
7505c6e0 2421 result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2a776823 2422 "string pointers");
7505c6e0 2423 result[list_len] = NULL;
c829a427 2424
7505c6e0 2425 /* The list might have been modified in another thread, so
c829a427
MV
2426 we check LIST before each access.
2427 */
7505c6e0 2428 for (i = 0; i < list_len && scm_is_pair (list); i++)
c829a427 2429 {
7505c6e0
MW
2430 SCM str = SCM_CAR (list);
2431 size_t len; /* String length in bytes */
2432 char *c_str = scm_to_locale_stringn (str, &len);
2433
2434 /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2435 scm_malloc to allocate the returned string, which must be
2436 explicitly deallocated. This forces us to copy the string a
2437 second time into a new buffer. Ideally there would be variants
2438 of scm_to_*_stringn that can return garbage-collected buffers. */
2439
2440 result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2441 memcpy (result[i], c_str, len);
2a776823 2442 result[i][len] = '\0';
7505c6e0 2443 free (c_str);
2a776823 2444
c829a427
MV
2445 list = SCM_CDR (list);
2446 }
2447
c829a427 2448 return result;
af68e5e5 2449}
2a776823 2450#undef FUNC_NAME
24933780 2451
6f14f578
MV
2452void
2453scm_i_get_substring_spec (size_t len,
2454 SCM start, size_t *cstart,
2455 SCM end, size_t *cend)
2456{
2457 if (SCM_UNBNDP (start))
2458 *cstart = 0;
2459 else
2460 *cstart = scm_to_unsigned_integer (start, 0, len);
2461
2462 if (SCM_UNBNDP (end))
2463 *cend = len;
2464 else
2465 *cend = scm_to_unsigned_integer (end, *cstart, len);
2466}
2467
f65e0168 2468SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2a610be5 2469
0f2d19dd
JB
2470void
2471scm_init_strings ()
0f2d19dd 2472{
a7e392c1 2473 scm_nullstr = scm_i_make_string (0, NULL, 0);
7c33806a 2474
a0599745 2475#include "libguile/strings.x"
0f2d19dd
JB
2476}
2477
89e00824
ML
2478
2479/*
2480 Local Variables:
2481 c-file-style: "gnu"
2482 End:
2483*/