Merge remote-tracking branch 'origin/stable-2.0'
[bpt/guile.git] / libguile / strings.c
CommitLineData
d40e1ca8 1/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
0f2d19dd 2 *
73be1d9e 3 * This library is free software; you can redistribute it and/or
53befeb7
NJ
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
0f2d19dd 7 *
53befeb7
NJ
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
0f2d19dd 12 *
73be1d9e
MV
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
53befeb7
NJ
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
73be1d9e 17 */
1bbd0b84 18
1bbd0b84 19
0f2d19dd 20\f
dbb605f5
LC
21#ifdef HAVE_CONFIG_H
22# include <config.h>
23#endif
0f2d19dd 24
cdd47ec7 25#include <alloca.h>
faf2c9d7 26#include <string.h>
3ee86942 27#include <stdio.h>
9c44cd45 28#include <ctype.h>
edb7bb47 29#include <uninorm.h>
9c44cd45 30#include <unistr.h>
eca29b02 31#include <uniconv.h>
faf2c9d7 32
889975e5
MG
33#include "striconveh.h"
34
a0599745
MD
35#include "libguile/_scm.h"
36#include "libguile/chars.h"
7c33806a 37#include "libguile/root.h"
a0599745 38#include "libguile/strings.h"
a3d7d5d5 39#include "libguile/error.h"
f45eccff 40#include "libguile/generalized-vectors.h"
1afff620 41#include "libguile/deprecation.h"
a0599745 42#include "libguile/validate.h"
d31b9519 43#include "libguile/private-options.h"
1afff620 44
0f2d19dd
JB
45\f
46
47/* {Strings}
48 */
49
3ee86942
MV
50
51/* Stringbufs
52 *
53 * XXX - keeping an accurate refcount during GC seems to be quite
54 * tricky, so we just keep score of whether a stringbuf might be
50b1996f 55 * shared, not whether it definitely is.
3ee86942
MV
56 *
57 * The scheme I (mvo) tried to keep an accurate reference count would
58 * recount all strings that point to a stringbuf during the mark-phase
59 * of the GC. This was done since one cannot access the stringbuf of
60 * a string when that string is freed (in order to decrease the
61 * reference count). The memory of the stringbuf might have been
62 * reused already for something completely different.
63 *
64 * This recounted worked for a small number of threads beating on
65 * cow-strings, but it failed randomly with more than 10 threads, say.
66 * I couldn't figure out what went wrong, so I used the conservative
67 * approach implemented below.
50b1996f 68 *
ba54a202
LC
69 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
70 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
71 * strings.
3ee86942
MV
72 */
73
ba54a202
LC
74/* The size in words of the stringbuf header (type tag + size). */
75#define STRINGBUF_HEADER_SIZE 2U
76
77#define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
78
35920c00 79#define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
5f236208 80#define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
3ee86942
MV
81
82#define STRINGBUF_TAG scm_tc7_stringbuf
83#define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
9c44cd45 84#define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
3ee86942 85
100e20c7 86#define STRINGBUF_CONTENTS(buf) ((void *) \
ba54a202
LC
87 SCM_CELL_OBJECT_LOC (buf, \
88 STRINGBUF_HEADER_SIZE))
100e20c7
LC
89#define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
90#define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
3ee86942 91
100e20c7 92#define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
3ee86942 93
9b41542f
LC
94#define SET_STRINGBUF_SHARED(buf) \
95 do \
96 { \
97 /* Don't modify BUF if it's already marked as shared since it might be \
98 a read-only, statically allocated stringbuf. */ \
99 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
100 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
101 } \
102 while (0)
3ee86942 103
56a3dcd4 104#ifdef SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
105static size_t lenhist[1001];
106#endif
107
50b1996f
MG
108/* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
109 characters. */
3ee86942
MV
110static SCM
111make_stringbuf (size_t len)
0f2d19dd 112{
3ee86942
MV
113 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
114 scm_i_symbol_chars, all stringbufs are null-terminated. Once
115 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
116 has been changed for scm_i_symbol_chars, this null-termination
117 can be dropped.
118 */
119
ba54a202
LC
120 SCM buf;
121
56a3dcd4 122#ifdef SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
123 if (len < 1000)
124 lenhist[len]++;
125 else
126 lenhist[1000]++;
127#endif
0f2d19dd 128
21041372 129 buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
ba54a202
LC
130 "string"));
131
132 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
133 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
134
135 STRINGBUF_CHARS (buf)[len] = 0;
136
137 return buf;
3ee86942 138}
e53cc817 139
50b1996f
MG
140/* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
141 characters. */
9c44cd45
MG
142static SCM
143make_wide_stringbuf (size_t len)
144{
ba54a202
LC
145 SCM buf;
146 size_t raw_len;
147
56a3dcd4 148#ifdef SCM_STRING_LENGTH_HISTOGRAM
9c44cd45
MG
149 if (len < 1000)
150 lenhist[len]++;
151 else
152 lenhist[1000]++;
153#endif
154
ba54a202 155 raw_len = (len + 1) * sizeof (scm_t_wchar);
21041372 156 buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
ba54a202
LC
157 "string"));
158
159 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
160 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
161
162 STRINGBUF_WIDE_CHARS (buf)[len] = 0;
163
164 return buf;
9c44cd45
MG
165}
166
ba54a202
LC
167/* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
168 characters from BUF. */
169static SCM
170wide_stringbuf (SCM buf)
9c44cd45 171{
ba54a202 172 SCM new_buf;
9c44cd45
MG
173
174 if (STRINGBUF_WIDE (buf))
ba54a202
LC
175 new_buf = buf;
176 else
9c44cd45 177 {
ba54a202
LC
178 size_t i, len;
179 scm_t_wchar *mem;
9c44cd45 180
ba54a202 181 len = STRINGBUF_LENGTH (buf);
9c44cd45 182
ba54a202 183 new_buf = make_wide_stringbuf (len);
9c44cd45 184
ba54a202 185 mem = STRINGBUF_WIDE_CHARS (new_buf);
9c44cd45 186 for (i = 0; i < len; i++)
ba54a202 187 mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
9c44cd45 188 mem[len] = 0;
9c44cd45 189 }
ba54a202
LC
190
191 return new_buf;
3ee86942 192}
bd9e24b3 193
ba54a202
LC
194/* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
195 characters from BUF, if possible. */
196static SCM
587a3355
MG
197narrow_stringbuf (SCM buf)
198{
ba54a202 199 SCM new_buf;
587a3355
MG
200
201 if (!STRINGBUF_WIDE (buf))
ba54a202
LC
202 new_buf = buf;
203 else
204 {
205 size_t i, len;
206 scm_t_wchar *wmem;
207 unsigned char *mem;
587a3355 208
ba54a202
LC
209 len = STRINGBUF_LENGTH (buf);
210 wmem = STRINGBUF_WIDE_CHARS (buf);
587a3355 211
ba54a202
LC
212 for (i = 0; i < len; i++)
213 if (wmem[i] > 0xFF)
214 /* BUF cannot be narrowed. */
215 return buf;
587a3355 216
ba54a202 217 new_buf = make_stringbuf (len);
587a3355 218
ba54a202
LC
219 mem = STRINGBUF_CHARS (new_buf);
220 for (i = 0; i < len; i++)
221 mem[i] = (unsigned char) wmem[i];
222 mem[len] = 0;
223 }
224
225 return new_buf;
587a3355
MG
226}
227
9de87eea 228scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
bd9e24b3 229
ba54a202 230\f
3ee86942
MV
231/* Copy-on-write strings.
232 */
bd9e24b3 233
3ee86942 234#define STRING_TAG scm_tc7_string
bd9e24b3 235
3ee86942
MV
236#define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
237#define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
238#define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
bd9e24b3 239
3ee86942
MV
240#define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
241#define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
242
dc7da0be 243#define IS_STRING(str) (SCM_HAS_TYP7 (str, STRING_TAG))
3ee86942 244
ed35de72
MV
245/* Read-only strings.
246 */
247
35920c00 248#define RO_STRING_TAG scm_tc7_ro_string
ed35de72
MV
249#define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
250
e1b29f6a
MV
251/* Mutation-sharing substrings
252 */
253
254#define SH_STRING_TAG (scm_tc7_string + 0x100)
255
256#define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
257/* START and LENGTH as for STRINGs. */
258
259#define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
260
e7efe8e7
AW
261SCM scm_nullstr;
262
50b1996f
MG
263/* Create a scheme string with space for LEN 8-bit Latin-1-encoded
264 characters. CHARSP, if not NULL, will be set to location of the
190d4b0d
LC
265 char array. If READ_ONLY_P, the returned string is read-only;
266 otherwise it is writable. */
3ee86942 267SCM
190d4b0d 268scm_i_make_string (size_t len, char **charsp, int read_only_p)
3ee86942
MV
269{
270 SCM buf = make_stringbuf (len);
271 SCM res;
272 if (charsp)
f59cf998 273 *charsp = (char *) STRINGBUF_CHARS (buf);
190d4b0d
LC
274 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
275 SCM_UNPACK (buf),
276 (scm_t_bits) 0, (scm_t_bits) len);
3ee86942 277 return res;
0f2d19dd
JB
278}
279
50b1996f
MG
280/* Create a scheme string with space for LEN 32-bit UCS-4-encoded
281 characters. CHARSP, if not NULL, will be set to location of the
190d4b0d
LC
282 character array. If READ_ONLY_P, the returned string is read-only;
283 otherwise it is writable. */
9c44cd45 284SCM
190d4b0d 285scm_i_make_wide_string (size_t len, scm_t_wchar **charsp, int read_only_p)
9c44cd45
MG
286{
287 SCM buf = make_wide_stringbuf (len);
288 SCM res;
289 if (charsp)
290 *charsp = STRINGBUF_WIDE_CHARS (buf);
190d4b0d
LC
291 res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
292 SCM_UNPACK (buf),
9c44cd45
MG
293 (scm_t_bits) 0, (scm_t_bits) len);
294 return res;
295}
296
3ee86942
MV
297static void
298validate_substring_args (SCM str, size_t start, size_t end)
299{
300 if (!IS_STRING (str))
301 scm_wrong_type_arg_msg (NULL, 0, str, "string");
302 if (start > STRING_LENGTH (str))
303 scm_out_of_range (NULL, scm_from_size_t (start));
304 if (end > STRING_LENGTH (str) || end < start)
305 scm_out_of_range (NULL, scm_from_size_t (end));
306}
0f2d19dd 307
e1b29f6a
MV
308static inline void
309get_str_buf_start (SCM *str, SCM *buf, size_t *start)
310{
311 *start = STRING_START (*str);
312 if (IS_SH_STRING (*str))
313 {
314 *str = SH_STRING_STRING (*str);
315 *start += STRING_START (*str);
316 }
317 *buf = STRING_STRINGBUF (*str);
318}
319
3ee86942
MV
320SCM
321scm_i_substring (SCM str, size_t start, size_t end)
0f2d19dd 322{
e1b29f6a
MV
323 SCM buf;
324 size_t str_start;
325 get_str_buf_start (&str, &buf, &str_start);
9de87eea 326 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 327 SET_STRINGBUF_SHARED (buf);
9de87eea 328 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 329 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
e1b29f6a
MV
330 (scm_t_bits)str_start + start,
331 (scm_t_bits) end - start);
0f2d19dd
JB
332}
333
ed35de72
MV
334SCM
335scm_i_substring_read_only (SCM str, size_t start, size_t end)
336{
45a9f430
LC
337 SCM buf;
338 size_t str_start;
339 get_str_buf_start (&str, &buf, &str_start);
340 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
341 SET_STRINGBUF_SHARED (buf);
342 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
343 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
344 (scm_t_bits)str_start + start,
345 (scm_t_bits) end - start);
ed35de72
MV
346}
347
3ee86942
MV
348SCM
349scm_i_substring_copy (SCM str, size_t start, size_t end)
350{
351 size_t len = end - start;
edea856c 352 SCM buf, my_buf;
e1b29f6a
MV
353 size_t str_start;
354 get_str_buf_start (&str, &buf, &str_start);
9c44cd45
MG
355 if (scm_i_is_narrow_string (str))
356 {
357 my_buf = make_stringbuf (len);
358 memcpy (STRINGBUF_CHARS (my_buf),
359 STRINGBUF_CHARS (buf) + str_start + start, len);
360 }
361 else
362 {
363 my_buf = make_wide_stringbuf (len);
364 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
365 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
366 + start), len);
367 /* Even though this string is wide, the substring may be narrow.
50b1996f 368 Consider adding code to narrow the string. */
9c44cd45 369 }
3ee86942 370 scm_remember_upto_here_1 (buf);
9c44cd45
MG
371 return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
372 (scm_t_bits) 0, (scm_t_bits) len);
3ee86942 373}
0f2d19dd 374
e1b29f6a
MV
375SCM
376scm_i_substring_shared (SCM str, size_t start, size_t end)
377{
378 if (start == 0 && end == STRING_LENGTH (str))
379 return str;
380 else
381 {
382 size_t len = end - start;
383 if (IS_SH_STRING (str))
384 {
385 start += STRING_START (str);
386 str = SH_STRING_STRING (str);
387 }
388 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
389 (scm_t_bits)start, (scm_t_bits) len);
390 }
391}
392
3ee86942
MV
393SCM
394scm_c_substring (SCM str, size_t start, size_t end)
395{
396 validate_substring_args (str, start, end);
397 return scm_i_substring (str, start, end);
398}
ee149d03 399
ed35de72
MV
400SCM
401scm_c_substring_read_only (SCM str, size_t start, size_t end)
402{
403 validate_substring_args (str, start, end);
404 return scm_i_substring_read_only (str, start, end);
405}
406
0f2d19dd 407SCM
3ee86942 408scm_c_substring_copy (SCM str, size_t start, size_t end)
0f2d19dd 409{
3ee86942
MV
410 validate_substring_args (str, start, end);
411 return scm_i_substring_copy (str, start, end);
412}
413
3ee86942
MV
414SCM
415scm_c_substring_shared (SCM str, size_t start, size_t end)
416{
417 validate_substring_args (str, start, end);
418 return scm_i_substring_shared (str, start, end);
419}
0f2d19dd 420
d6c74168 421\f
3ee86942
MV
422/* Internal accessors
423 */
424
50b1996f
MG
425/* Returns the number of characters in STR. This may be different
426 than the memory size of the string storage. */
3ee86942
MV
427size_t
428scm_i_string_length (SCM str)
0f2d19dd 429{
3ee86942 430 return STRING_LENGTH (str);
0f2d19dd
JB
431}
432
50b1996f
MG
433/* True if the string is 'narrow', meaning it has a 8-bit Latin-1
434 encoding. False if it is 'wide', having a 32-bit UCS-4
435 encoding. */
9c44cd45
MG
436int
437scm_i_is_narrow_string (SCM str)
438{
439 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
440}
441
587a3355
MG
442/* Try to coerce a string to be narrow. It if is narrow already, do
443 nothing. If it is wide, shrink it to narrow if none of its
444 characters are above 0xFF. Return true if the string is narrow or
445 was made to be narrow. */
446int
447scm_i_try_narrow_string (SCM str)
448{
ba54a202 449 SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
587a3355
MG
450
451 return scm_i_is_narrow_string (str);
452}
453
100e20c7
LC
454/* Return a pointer to the raw data of the string, which can be either Latin-1
455 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
456const void *
457scm_i_string_data (SCM str)
458{
459 SCM buf;
460 size_t start;
461 const char *data;
462
463 get_str_buf_start (&str, &buf, &start);
464
465 data = STRINGBUF_CONTENTS (buf);
466 data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
467
468 return data;
469}
470
50b1996f
MG
471/* Returns a pointer to the 8-bit Latin-1 encoded character array of
472 STR. */
3ee86942
MV
473const char *
474scm_i_string_chars (SCM str)
475{
476 SCM buf;
e1b29f6a
MV
477 size_t start;
478 get_str_buf_start (&str, &buf, &start);
9c44cd45 479 if (scm_i_is_narrow_string (str))
f59cf998 480 return (const char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
481 else
482 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
483 scm_list_1 (str));
484 return NULL;
3ee86942 485}
b00418df 486
50b1996f
MG
487/* Returns a pointer to the 32-bit UCS-4 encoded character array of
488 STR. */
9c44cd45
MG
489const scm_t_wchar *
490scm_i_string_wide_chars (SCM str)
491{
492 SCM buf;
493 size_t start;
494
495 get_str_buf_start (&str, &buf, &start);
496 if (!scm_i_is_narrow_string (str))
f59cf998 497 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
9c44cd45
MG
498 else
499 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
500 scm_list_1 (str));
501}
502
503/* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
504 a new string buffer, so that it can be modified without modifying
50b1996f
MG
505 other strings. Also, lock the string mutex. Later, one must call
506 scm_i_string_stop_writing to unlock the mutex. */
9c44cd45
MG
507SCM
508scm_i_string_start_writing (SCM orig_str)
b00418df 509{
ed35de72 510 SCM buf, str = orig_str;
e1b29f6a 511 size_t start;
ed35de72 512
e1b29f6a 513 get_str_buf_start (&str, &buf, &start);
ed35de72
MV
514 if (IS_RO_STRING (str))
515 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
516
9de87eea 517 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942
MV
518 if (STRINGBUF_SHARED (buf))
519 {
9c44cd45 520 /* Clone the stringbuf. */
3ee86942
MV
521 size_t len = STRING_LENGTH (str);
522 SCM new_buf;
523
9de87eea 524 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 525
9c44cd45
MG
526 if (scm_i_is_narrow_string (str))
527 {
528 new_buf = make_stringbuf (len);
529 memcpy (STRINGBUF_CHARS (new_buf),
530 STRINGBUF_CHARS (buf) + STRING_START (str), len);
531
532 }
533 else
534 {
535 new_buf = make_wide_stringbuf (len);
536 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
537 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
538 + STRING_START (str)), len);
539 }
3ee86942 540
3ee86942
MV
541 SET_STRING_STRINGBUF (str, new_buf);
542 start -= STRING_START (str);
902578f1
LC
543
544 /* FIXME: The following operations are not atomic, so other threads
545 looking at STR may see an inconsistent state. Nevertheless it can't
546 hurt much since (i) accessing STR while it is being mutated can't
547 yield a crash, and (ii) concurrent accesses to STR should be
548 protected by a mutex at the application level. The latter may not
549 apply when STR != ORIG_STR, though. */
3ee86942 550 SET_STRING_START (str, 0);
902578f1 551 SET_STRING_STRINGBUF (str, new_buf);
3ee86942
MV
552
553 buf = new_buf;
554
9de87eea 555 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 556 }
9c44cd45
MG
557 return orig_str;
558}
3ee86942 559
50b1996f 560/* Return a pointer to the 8-bit Latin-1 chars of a string. */
9c44cd45
MG
561char *
562scm_i_string_writable_chars (SCM str)
563{
564 SCM buf;
565 size_t start;
3ee86942 566
9c44cd45
MG
567 get_str_buf_start (&str, &buf, &start);
568 if (scm_i_is_narrow_string (str))
f59cf998 569 return (char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
570 else
571 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
572 scm_list_1 (str));
573 return NULL;
b00418df
DH
574}
575
50b1996f 576/* Return a pointer to the UCS-4 codepoints of a string. */
9c44cd45
MG
577static scm_t_wchar *
578scm_i_string_writable_wide_chars (SCM str)
579{
580 SCM buf;
581 size_t start;
582
583 get_str_buf_start (&str, &buf, &start);
584 if (!scm_i_is_narrow_string (str))
585 return STRINGBUF_WIDE_CHARS (buf) + start;
586 else
1c7b216f 587 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
9c44cd45 588 scm_list_1 (str));
b00418df
DH
589}
590
50b1996f
MG
591/* Unlock the string mutex that was locked when
592 scm_i_string_start_writing was called. */
3ee86942
MV
593void
594scm_i_string_stop_writing (void)
595{
9de87eea 596 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 597}
b00418df 598
50b1996f 599/* Return the Xth character of STR as a UCS-4 codepoint. */
9c44cd45
MG
600scm_t_wchar
601scm_i_string_ref (SCM str, size_t x)
602{
603 if (scm_i_is_narrow_string (str))
604 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
605 else
606 return scm_i_string_wide_chars (str)[x];
607}
608
889975e5
MG
609/* Returns index+1 of the first char in STR that matches C, or
610 0 if the char is not found. */
611int
612scm_i_string_contains_char (SCM str, char ch)
613{
614 size_t i;
615 size_t len = scm_i_string_length (str);
616
617 i = 0;
618 if (scm_i_is_narrow_string (str))
619 {
620 while (i < len)
621 {
622 if (scm_i_string_chars (str)[i] == ch)
623 return i+1;
624 i++;
625 }
626 }
627 else
628 {
629 while (i < len)
630 {
631 if (scm_i_string_wide_chars (str)[i]
632 == (unsigned char) ch)
633 return i+1;
634 i++;
635 }
636 }
637 return 0;
638}
639
3f47e526
MG
640int
641scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
642{
643 if (scm_i_is_narrow_string (sstr))
644 {
645 const char *a = scm_i_string_chars (sstr) + start_x;
646 const char *b = cstr;
647 return strncmp (a, b, strlen(b));
648 }
649 else
650 {
651 size_t i;
652 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
653 const char *b = cstr;
654 for (i = 0; i < strlen (b); i++)
655 {
656 if (a[i] != (unsigned char) b[i])
657 return 1;
658 }
659 }
660 return 0;
661}
662
50b1996f 663/* Set the Pth character of STR to UCS-4 codepoint CHR. */
9c44cd45
MG
664void
665scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
666{
667 if (chr > 0xFF && scm_i_is_narrow_string (str))
ba54a202 668 SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
9c44cd45
MG
669
670 if (scm_i_is_narrow_string (str))
671 {
672 char *dst = scm_i_string_writable_chars (str);
587a3355 673 dst[p] = chr;
9c44cd45
MG
674 }
675 else
676 {
677 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
678 dst[p] = chr;
679 }
680}
681
ba54a202 682\f
3ee86942 683/* Symbols.
587a3355 684
3ee86942
MV
685 Basic symbol creation and accessing is done here, the rest is in
686 symbols.[hc]. This has been done to keep stringbufs and the
687 internals of strings and string-like objects confined to this file.
688*/
689
690#define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
691
692SCM
6869328b
MV
693scm_i_make_symbol (SCM name, scm_t_bits flags,
694 unsigned long hash, SCM props)
3ee86942
MV
695{
696 SCM buf;
697 size_t start = STRING_START (name);
698 size_t length = STRING_LENGTH (name);
699
700 if (IS_SH_STRING (name))
701 {
702 name = SH_STRING_STRING (name);
703 start += STRING_START (name);
704 }
705 buf = SYMBOL_STRINGBUF (name);
706
707 if (start == 0 && length == STRINGBUF_LENGTH (buf))
708 {
709 /* reuse buf. */
9de87eea 710 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 711 SET_STRINGBUF_SHARED (buf);
9de87eea 712 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942
MV
713 }
714 else
715 {
716 /* make new buf. */
9c44cd45
MG
717 if (scm_i_is_narrow_string (name))
718 {
719 SCM new_buf = make_stringbuf (length);
720 memcpy (STRINGBUF_CHARS (new_buf),
721 STRINGBUF_CHARS (buf) + start, length);
722 buf = new_buf;
723 }
724 else
725 {
726 SCM new_buf = make_wide_stringbuf (length);
727 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
728 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
729 length);
730 buf = new_buf;
731 }
3ee86942 732 }
6869328b 733 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
3ee86942
MV
734 (scm_t_bits) hash, SCM_UNPACK (props));
735}
736
fd0a5bbc
HWN
737SCM
738scm_i_c_make_symbol (const char *name, size_t len,
739 scm_t_bits flags, unsigned long hash, SCM props)
740{
741 SCM buf = make_stringbuf (len);
742 memcpy (STRINGBUF_CHARS (buf), name, len);
743
65619ebe
AW
744 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
745 (scm_t_bits) hash, SCM_UNPACK (props));
fd0a5bbc
HWN
746}
747
50b1996f
MG
748/* Returns the number of characters in SYM. This may be different
749 from the memory size of SYM. */
3ee86942
MV
750size_t
751scm_i_symbol_length (SCM sym)
0f2d19dd 752{
3ee86942 753 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
0f2d19dd
JB
754}
755
071bb6a8
LC
756size_t
757scm_c_symbol_length (SCM sym)
758#define FUNC_NAME "scm_c_symbol_length"
759{
760 SCM_VALIDATE_SYMBOL (1, sym);
761
762 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
763}
764#undef FUNC_NAME
765
50b1996f
MG
766/* True if the name of SYM is stored as a Latin-1 encoded string.
767 False if it is stored as a 32-bit UCS-4-encoded string. */
9c44cd45
MG
768int
769scm_i_is_narrow_symbol (SCM sym)
770{
771 SCM buf;
772
773 buf = SYMBOL_STRINGBUF (sym);
774 return !STRINGBUF_WIDE (buf);
775}
776
50b1996f
MG
777/* Returns a pointer to the 8-bit Latin-1 encoded character array that
778 contains the name of SYM. */
3ee86942
MV
779const char *
780scm_i_symbol_chars (SCM sym)
781{
9c44cd45
MG
782 SCM buf;
783
784 buf = SYMBOL_STRINGBUF (sym);
785 if (!STRINGBUF_WIDE (buf))
f59cf998 786 return (const char *) STRINGBUF_CHARS (buf);
9c44cd45
MG
787 else
788 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
789 scm_list_1 (sym));
790}
791
50b1996f
MG
792/* Return a pointer to the 32-bit UCS-4-encoded character array of a
793 symbol's name. */
9c44cd45
MG
794const scm_t_wchar *
795scm_i_symbol_wide_chars (SCM sym)
796{
797 SCM buf;
798
799 buf = SYMBOL_STRINGBUF (sym);
800 if (STRINGBUF_WIDE (buf))
f59cf998 801 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
9c44cd45
MG
802 else
803 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
804 scm_list_1 (sym));
3ee86942 805}
1cc91f1b 806
be54b15d 807SCM
3ee86942 808scm_i_symbol_substring (SCM sym, size_t start, size_t end)
be54b15d 809{
3ee86942 810 SCM buf = SYMBOL_STRINGBUF (sym);
9de87eea 811 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 812 SET_STRINGBUF_SHARED (buf);
9de87eea 813 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
fd2b17b9 814 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
3ee86942
MV
815 (scm_t_bits)start, (scm_t_bits) end - start);
816}
be54b15d 817
50b1996f 818/* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
9c44cd45
MG
819scm_t_wchar
820scm_i_symbol_ref (SCM sym, size_t x)
821{
822 if (scm_i_is_narrow_symbol (sym))
823 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
824 else
825 return scm_i_symbol_wide_chars (sym)[x];
826}
827
3ee86942
MV
828/* Debugging
829 */
be54b15d 830
6ce6923b
MG
831SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
832 "Returns an association list containing debugging information\n"
833 "for @var{str}. The association list has the following entries."
834 "@table @code\n"
835 "@item string\n"
836 "The string itself.\n"
837 "@item start\n"
838 "The start index of the string into its stringbuf\n"
839 "@item length\n"
840 "The length of the string\n"
841 "@item shared\n"
842 "If this string is a substring, it returns its parent string.\n"
843 "Otherwise, it returns @code{#f}\n"
88ed5759
MG
844 "@item read-only\n"
845 "@code{#t} if the string is read-only\n"
6ce6923b
MG
846 "@item stringbuf-chars\n"
847 "A new string containing this string's stringbuf's characters\n"
848 "@item stringbuf-length\n"
849 "The number of characters in this stringbuf\n"
850 "@item stringbuf-shared\n"
851 "@code{#t} if this stringbuf is shared\n"
6ce6923b
MG
852 "@item stringbuf-wide\n"
853 "@code{#t} if this stringbuf's characters are stored in a\n"
854 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
855 "buffer\n"
856 "@end table")
3ee86942
MV
857#define FUNC_NAME s_scm_sys_string_dump
858{
ba54a202 859 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
6ce6923b 860 SCM buf;
3ee86942 861 SCM_VALIDATE_STRING (1, str);
6ce6923b
MG
862
863 /* String info */
4a655e50 864 e1 = scm_cons (scm_from_latin1_symbol ("string"),
6ce6923b 865 str);
4a655e50 866 e2 = scm_cons (scm_from_latin1_symbol ("start"),
6ce6923b 867 scm_from_size_t (STRING_START (str)));
4a655e50 868 e3 = scm_cons (scm_from_latin1_symbol ("length"),
6ce6923b
MG
869 scm_from_size_t (STRING_LENGTH (str)));
870
3ee86942
MV
871 if (IS_SH_STRING (str))
872 {
4a655e50 873 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
6ce6923b
MG
874 SH_STRING_STRING (str));
875 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
3ee86942
MV
876 }
877 else
878 {
4a655e50 879 e4 = scm_cons (scm_from_latin1_symbol ("shared"),
6ce6923b
MG
880 SCM_BOOL_F);
881 buf = STRING_STRINGBUF (str);
3ee86942 882 }
9c44cd45 883
88ed5759 884 if (IS_RO_STRING (str))
4a655e50 885 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
88ed5759
MG
886 SCM_BOOL_T);
887 else
4a655e50 888 e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
88ed5759 889 SCM_BOOL_F);
587a3355 890
6ce6923b 891 /* Stringbuf info */
6ce6923b
MG
892 if (!STRINGBUF_WIDE (buf))
893 {
894 size_t len = STRINGBUF_LENGTH (buf);
895 char *cbuf;
190d4b0d 896 SCM sbc = scm_i_make_string (len, &cbuf, 0);
6ce6923b 897 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
4a655e50 898 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b 899 sbc);
3ee86942 900 }
6ce6923b
MG
901 else
902 {
903 size_t len = STRINGBUF_LENGTH (buf);
904 scm_t_wchar *cbuf;
190d4b0d 905 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
6ce6923b
MG
906 u32_cpy ((scm_t_uint32 *) cbuf,
907 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
4a655e50 908 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b
MG
909 sbc);
910 }
4a655e50 911 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
6ce6923b
MG
912 scm_from_size_t (STRINGBUF_LENGTH (buf)));
913 if (STRINGBUF_SHARED (buf))
4a655e50 914 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b
MG
915 SCM_BOOL_T);
916 else
4a655e50 917 e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b 918 SCM_BOOL_F);
6ce6923b 919 if (STRINGBUF_WIDE (buf))
4a655e50 920 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
ba54a202 921 SCM_BOOL_T);
6ce6923b 922 else
4a655e50 923 e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
ba54a202 924 SCM_BOOL_F);
6ce6923b 925
ba54a202 926 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
3ee86942
MV
927}
928#undef FUNC_NAME
929
6ce6923b
MG
930SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
931 "Returns an association list containing debugging information\n"
932 "for @var{sym}. The association list has the following entries."
933 "@table @code\n"
934 "@item symbol\n"
935 "The symbol itself\n"
936 "@item hash\n"
937 "Its hash value\n"
88ed5759
MG
938 "@item interned\n"
939 "@code{#t} if it is an interned symbol\n"
6ce6923b
MG
940 "@item stringbuf-chars\n"
941 "A new string containing this symbols's stringbuf's characters\n"
942 "@item stringbuf-length\n"
943 "The number of characters in this stringbuf\n"
944 "@item stringbuf-shared\n"
945 "@code{#t} if this stringbuf is shared\n"
6ce6923b
MG
946 "@item stringbuf-wide\n"
947 "@code{#t} if this stringbuf's characters are stored in a\n"
948 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
949 "buffer\n"
950 "@end table")
3ee86942
MV
951#define FUNC_NAME s_scm_sys_symbol_dump
952{
ba54a202 953 SCM e1, e2, e3, e4, e5, e6, e7;
6ce6923b 954 SCM buf;
3ee86942 955 SCM_VALIDATE_SYMBOL (1, sym);
4a655e50 956 e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
6ce6923b 957 sym);
4a655e50 958 e2 = scm_cons (scm_from_latin1_symbol ("hash"),
6ce6923b 959 scm_from_ulong (scm_i_symbol_hash (sym)));
4a655e50 960 e3 = scm_cons (scm_from_latin1_symbol ("interned"),
88ed5759 961 scm_symbol_interned_p (sym));
6ce6923b
MG
962 buf = SYMBOL_STRINGBUF (sym);
963
964 /* Stringbuf info */
6ce6923b
MG
965 if (!STRINGBUF_WIDE (buf))
966 {
967 size_t len = STRINGBUF_LENGTH (buf);
968 char *cbuf;
190d4b0d 969 SCM sbc = scm_i_make_string (len, &cbuf, 0);
6ce6923b 970 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
4a655e50 971 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b
MG
972 sbc);
973 }
9c44cd45 974 else
6ce6923b
MG
975 {
976 size_t len = STRINGBUF_LENGTH (buf);
977 scm_t_wchar *cbuf;
190d4b0d 978 SCM sbc = scm_i_make_wide_string (len, &cbuf, 0);
6ce6923b
MG
979 u32_cpy ((scm_t_uint32 *) cbuf,
980 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
4a655e50 981 e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
6ce6923b
MG
982 sbc);
983 }
4a655e50 984 e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
6ce6923b
MG
985 scm_from_size_t (STRINGBUF_LENGTH (buf)));
986 if (STRINGBUF_SHARED (buf))
4a655e50 987 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b
MG
988 SCM_BOOL_T);
989 else
4a655e50 990 e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
6ce6923b 991 SCM_BOOL_F);
6ce6923b 992 if (STRINGBUF_WIDE (buf))
4a655e50 993 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
6ce6923b
MG
994 SCM_BOOL_T);
995 else
4a655e50 996 e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
6ce6923b 997 SCM_BOOL_F);
ba54a202 998 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
6ce6923b 999
3ee86942
MV
1000}
1001#undef FUNC_NAME
1002
56a3dcd4 1003#ifdef SCM_STRING_LENGTH_HISTOGRAM
6ce6923b 1004
9c44cd45 1005SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
e1b29f6a 1006#define FUNC_NAME s_scm_sys_stringbuf_hist
3ee86942
MV
1007{
1008 int i;
1009 for (i = 0; i < 1000; i++)
1010 if (lenhist[i])
1011 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1012 fprintf (stderr, ">999: %u\n", lenhist[1000]);
1013 return SCM_UNSPECIFIED;
be54b15d
DH
1014}
1015#undef FUNC_NAME
1016
3ee86942
MV
1017#endif
1018
1019\f
1020
1021SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1022 (SCM obj),
1023 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1024#define FUNC_NAME s_scm_string_p
1025{
1026 return scm_from_bool (IS_STRING (obj));
1027}
1028#undef FUNC_NAME
1029
1030
1031SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1032
1033SCM_DEFINE (scm_string, "string", 0, 0, 1,
1034 (SCM chrs),
1035 "@deffnx {Scheme Procedure} list->string chrs\n"
1036 "Return a newly allocated string composed of the arguments,\n"
1037 "@var{chrs}.")
1038#define FUNC_NAME s_scm_string
1039{
9aa27c1a 1040 SCM result = SCM_BOOL_F;
9c44cd45 1041 SCM rest;
3ee86942 1042 size_t len;
9c44cd45
MG
1043 size_t p = 0;
1044 long i;
9aa27c1a 1045 int wide = 0;
3ee86942 1046
9c44cd45
MG
1047 /* Verify that this is a list of chars. */
1048 i = scm_ilength (chrs);
3c7cf7f5 1049 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
3ee86942 1050
9c44cd45
MG
1051 len = (size_t) i;
1052 rest = chrs;
3ee86942 1053
9c44cd45 1054 while (len > 0 && scm_is_pair (rest))
3ee86942 1055 {
9c44cd45 1056 SCM elt = SCM_CAR (rest);
3ee86942 1057 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
9aa27c1a
MG
1058 if (SCM_CHAR (elt) > 0xFF)
1059 wide = 1;
9c44cd45
MG
1060 rest = SCM_CDR (rest);
1061 len--;
1062 scm_remember_upto_here_1 (elt);
1063 }
1064
1065 /* Construct a string containing this list of chars. */
1066 len = (size_t) i;
1067 rest = chrs;
1068
9aa27c1a 1069 if (wide == 0)
9c44cd45 1070 {
56a3dcd4
LC
1071 char *buf;
1072
190d4b0d 1073 result = scm_i_make_string (len, NULL, 0);
9aa27c1a 1074 result = scm_i_string_start_writing (result);
56a3dcd4 1075 buf = scm_i_string_writable_chars (result);
9aa27c1a
MG
1076 while (len > 0 && scm_is_pair (rest))
1077 {
1078 SCM elt = SCM_CAR (rest);
1079 buf[p] = (unsigned char) SCM_CHAR (elt);
1080 p++;
1081 rest = SCM_CDR (rest);
1082 len--;
1083 scm_remember_upto_here_1 (elt);
1084 }
1085 }
1086 else
1087 {
56a3dcd4
LC
1088 scm_t_wchar *buf;
1089
190d4b0d 1090 result = scm_i_make_wide_string (len, NULL, 0);
9aa27c1a 1091 result = scm_i_string_start_writing (result);
56a3dcd4 1092 buf = scm_i_string_writable_wide_chars (result);
9aa27c1a
MG
1093 while (len > 0 && scm_is_pair (rest))
1094 {
1095 SCM elt = SCM_CAR (rest);
1096 buf[p] = SCM_CHAR (elt);
1097 p++;
1098 rest = SCM_CDR (rest);
1099 len--;
1100 scm_remember_upto_here_1 (elt);
1101 }
3ee86942 1102 }
9c44cd45
MG
1103 scm_i_string_stop_writing ();
1104
3ee86942
MV
1105 if (len > 0)
1106 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
9c44cd45 1107 if (!scm_is_null (rest))
3ee86942
MV
1108 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1109
1110 return result;
1111}
1112#undef FUNC_NAME
be54b15d 1113
3b3b36dd 1114SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
6fa73e72 1115 (SCM k, SCM chr),
0d26a824
MG
1116 "Return a newly allocated string of\n"
1117 "length @var{k}. If @var{chr} is given, then all elements of\n"
1118 "the string are initialized to @var{chr}, otherwise the contents\n"
3ef6650d 1119 "of the @var{string} are all set to @var{#\nul}.")
1bbd0b84 1120#define FUNC_NAME s_scm_make_string
0f2d19dd 1121{
3ee86942
MV
1122 return scm_c_make_string (scm_to_size_t (k), chr);
1123}
1124#undef FUNC_NAME
1125
1126SCM
1127scm_c_make_string (size_t len, SCM chr)
1128#define FUNC_NAME NULL
1129{
9c44cd45 1130 size_t p;
3ef6650d 1131 char *contents = NULL;
190d4b0d 1132 SCM res = scm_i_make_string (len, &contents, 0);
cb0d8be2 1133
3ef6650d
AW
1134 /* If no char is given, initialize string contents to NULL. */
1135 if (SCM_UNBNDP (chr))
1136 memset (contents, 0, len);
1137 else
e11e83f3 1138 {
3ee86942 1139 SCM_VALIDATE_CHAR (0, chr);
9c44cd45
MG
1140 res = scm_i_string_start_writing (res);
1141 for (p = 0; p < len; p++)
1142 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1143 scm_i_string_stop_writing ();
0f2d19dd 1144 }
e11e83f3
MV
1145
1146 return res;
0f2d19dd 1147}
1bbd0b84 1148#undef FUNC_NAME
0f2d19dd 1149
3b3b36dd 1150SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
0d26a824
MG
1151 (SCM string),
1152 "Return the number of characters in @var{string}.")
1bbd0b84 1153#define FUNC_NAME s_scm_string_length
0f2d19dd 1154{
d1ca2c64 1155 SCM_VALIDATE_STRING (1, string);
3ee86942 1156 return scm_from_size_t (STRING_LENGTH (string));
0f2d19dd 1157}
1bbd0b84 1158#undef FUNC_NAME
0f2d19dd 1159
f8ba2bb9 1160SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
9c44cd45
MG
1161 (SCM string),
1162 "Return the bytes used to represent a character in @var{string}."
1163 "This will return 1 or 4.")
f8ba2bb9 1164#define FUNC_NAME s_scm_string_bytes_per_char
9c44cd45
MG
1165{
1166 SCM_VALIDATE_STRING (1, string);
1167 if (!scm_i_is_narrow_string (string))
1168 return scm_from_int (4);
1169
1170 return scm_from_int (1);
1171}
1172#undef FUNC_NAME
1173
3ee86942
MV
1174size_t
1175scm_c_string_length (SCM string)
1176{
1177 if (!IS_STRING (string))
1178 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1179 return STRING_LENGTH (string);
1180}
1181
bd9e24b3 1182SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
6fa73e72 1183 (SCM str, SCM k),
9c44cd45
MG
1184 "Return character @var{k} of @var{str} using zero-origin\n"
1185 "indexing. @var{k} must be a valid index of @var{str}.")
1bbd0b84 1186#define FUNC_NAME s_scm_string_ref
0f2d19dd 1187{
3ae3166b 1188 size_t len;
a55c2b68 1189 unsigned long idx;
bd9e24b3 1190
d1ca2c64 1191 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1192
1193 len = scm_i_string_length (str);
1194 if (SCM_LIKELY (len > 0))
1195 idx = scm_to_unsigned_integer (k, 0, len - 1);
1196 else
1197 scm_out_of_range (NULL, k);
1198
9c44cd45
MG
1199 if (scm_i_is_narrow_string (str))
1200 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1201 else
1202 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
0f2d19dd 1203}
1bbd0b84 1204#undef FUNC_NAME
0f2d19dd 1205
3ee86942
MV
1206SCM
1207scm_c_string_ref (SCM str, size_t p)
1208{
1209 if (p >= scm_i_string_length (str))
1210 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1211 if (scm_i_is_narrow_string (str))
1212 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1213 else
1214 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1215
3ee86942 1216}
f0942910 1217
3b3b36dd 1218SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
6fa73e72 1219 (SCM str, SCM k, SCM chr),
9c44cd45
MG
1220 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1221 "an unspecified value. @var{k} must be a valid index of\n"
1222 "@var{str}.")
1bbd0b84 1223#define FUNC_NAME s_scm_string_set_x
0f2d19dd 1224{
3ae3166b 1225 size_t len;
a55c2b68
MV
1226 unsigned long idx;
1227
f0942910 1228 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1229
1230 len = scm_i_string_length (str);
1231 if (SCM_LIKELY (len > 0))
1232 idx = scm_to_unsigned_integer (k, 0, len - 1);
1233 else
1234 scm_out_of_range (NULL, k);
1235
34d19ef6 1236 SCM_VALIDATE_CHAR (3, chr);
9c44cd45
MG
1237 str = scm_i_string_start_writing (str);
1238 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1239 scm_i_string_stop_writing ();
1240
0f2d19dd
JB
1241 return SCM_UNSPECIFIED;
1242}
1bbd0b84 1243#undef FUNC_NAME
0f2d19dd 1244
3ee86942
MV
1245void
1246scm_c_string_set_x (SCM str, size_t p, SCM chr)
1247{
1248 if (p >= scm_i_string_length (str))
1249 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1250 str = scm_i_string_start_writing (str);
1251 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1252 scm_i_string_stop_writing ();
3ee86942 1253}
0f2d19dd 1254
3b3b36dd 1255SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
0d26a824
MG
1256 (SCM str, SCM start, SCM end),
1257 "Return a newly allocated string formed from the characters\n"
1258 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1259 "ending with index @var{end} (exclusive).\n"
1260 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1261 "exact integers satisfying:\n\n"
1262 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1bbd0b84 1263#define FUNC_NAME s_scm_substring
0f2d19dd 1264{
3ee86942 1265 size_t len, from, to;
685c0d71 1266
d1ca2c64 1267 SCM_VALIDATE_STRING (1, str);
3ee86942
MV
1268 len = scm_i_string_length (str);
1269 from = scm_to_unsigned_integer (start, 0, len);
a55c2b68 1270 if (SCM_UNBNDP (end))
3ee86942 1271 to = len;
a55c2b68 1272 else
3ee86942
MV
1273 to = scm_to_unsigned_integer (end, from, len);
1274 return scm_i_substring (str, from, to);
0f2d19dd 1275}
1bbd0b84 1276#undef FUNC_NAME
0f2d19dd 1277
ed35de72
MV
1278SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1279 (SCM str, SCM start, SCM end),
1280 "Return a newly allocated string formed from the characters\n"
1281 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1282 "ending with index @var{end} (exclusive).\n"
1283 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1284 "exact integers satisfying:\n"
1285 "\n"
1286 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1287 "\n"
1288 "The returned string is read-only.\n")
1289#define FUNC_NAME s_scm_substring_read_only
1290{
1291 size_t len, from, to;
1292
1293 SCM_VALIDATE_STRING (1, str);
1294 len = scm_i_string_length (str);
1295 from = scm_to_unsigned_integer (start, 0, len);
1296 if (SCM_UNBNDP (end))
1297 to = len;
1298 else
1299 to = scm_to_unsigned_integer (end, from, len);
1300 return scm_i_substring_read_only (str, from, to);
1301}
1302#undef FUNC_NAME
1303
3ee86942
MV
1304SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1305 (SCM str, SCM start, SCM end),
1306 "Return a newly allocated string formed from the characters\n"
1307 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1308 "ending with index @var{end} (exclusive).\n"
1309 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1310 "exact integers satisfying:\n\n"
1311 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1312#define FUNC_NAME s_scm_substring_copy
1313{
e1b29f6a
MV
1314 /* For the Scheme version, START is mandatory, but for the C
1315 version, it is optional. See scm_string_copy in srfi-13.c for a
1316 rationale.
1317 */
1318
1319 size_t from, to;
3ee86942
MV
1320
1321 SCM_VALIDATE_STRING (1, str);
e1b29f6a
MV
1322 scm_i_get_substring_spec (scm_i_string_length (str),
1323 start, &from, end, &to);
3ee86942
MV
1324 return scm_i_substring_copy (str, from, to);
1325}
1326#undef FUNC_NAME
1327
1328SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1329 (SCM str, SCM start, SCM end),
1330 "Return string that indirectly refers to the characters\n"
1331 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1332 "ending with index @var{end} (exclusive).\n"
1333 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1334 "exact integers satisfying:\n\n"
1335 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1336#define FUNC_NAME s_scm_substring_shared
1337{
1338 size_t len, from, to;
1339
1340 SCM_VALIDATE_STRING (1, str);
1341 len = scm_i_string_length (str);
1342 from = scm_to_unsigned_integer (start, 0, len);
1343 if (SCM_UNBNDP (end))
1344 to = len;
1345 else
1346 to = scm_to_unsigned_integer (end, from, len);
1347 return scm_i_substring_shared (str, from, to);
1348}
1349#undef FUNC_NAME
685c0d71 1350
3b3b36dd 1351SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
6fa73e72 1352 (SCM args),
9c44cd45 1353 "Return a newly allocated string whose characters form the\n"
0d26a824 1354 "concatenation of the given strings, @var{args}.")
1bbd0b84 1355#define FUNC_NAME s_scm_string_append
0f2d19dd
JB
1356{
1357 SCM res;
9c44cd45
MG
1358 size_t len = 0;
1359 int wide = 0;
c829a427 1360 SCM l, s;
bd4911ef 1361 size_t i;
9909c395
MG
1362 union
1363 {
1364 char *narrow;
1365 scm_t_wchar *wide;
1366 } data;
af45e3b0
DH
1367
1368 SCM_VALIDATE_REST_ARGUMENT (args);
9c44cd45 1369 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427
MV
1370 {
1371 s = SCM_CAR (l);
1372 SCM_VALIDATE_STRING (SCM_ARGn, s);
9c44cd45
MG
1373 len += scm_i_string_length (s);
1374 if (!scm_i_is_narrow_string (s))
1375 wide = 1;
c829a427 1376 }
9909c395 1377 data.narrow = NULL;
9c44cd45 1378 if (!wide)
190d4b0d 1379 res = scm_i_make_string (len, &data.narrow, 0);
9c44cd45 1380 else
190d4b0d 1381 res = scm_i_make_wide_string (len, &data.wide, 0);
9c44cd45
MG
1382
1383 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427 1384 {
edea856c 1385 size_t len;
c829a427 1386 s = SCM_CAR (l);
3ee86942 1387 SCM_VALIDATE_STRING (SCM_ARGn, s);
edea856c 1388 len = scm_i_string_length (s);
9c44cd45
MG
1389 if (!wide)
1390 {
9909c395
MG
1391 memcpy (data.narrow, scm_i_string_chars (s), len);
1392 data.narrow += len;
9c44cd45
MG
1393 }
1394 else
1395 {
1396 if (scm_i_is_narrow_string (s))
1397 {
1398 for (i = 0; i < scm_i_string_length (s); i++)
9909c395 1399 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
9c44cd45
MG
1400 }
1401 else
9909c395 1402 u32_cpy ((scm_t_uint32 *) data.wide,
9c44cd45 1403 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
9909c395 1404 data.wide += len;
9c44cd45 1405 }
c829a427
MV
1406 scm_remember_upto_here_1 (s);
1407 }
0f2d19dd
JB
1408 return res;
1409}
1bbd0b84 1410#undef FUNC_NAME
0f2d19dd 1411
24933780 1412
a3d7d5d5 1413\f
c62da8f8 1414/* Charset conversion error handling. */
a3d7d5d5
LC
1415
1416SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
c62da8f8
LC
1417SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
1418
6851d3be
LC
1419/* Raise an exception informing that character CHR could not be written
1420 to PORT in its current encoding. */
d14418a5 1421void
ef7e4ba3 1422scm_encoding_error (const char *subr, int err, const char *message,
6851d3be 1423 SCM port, SCM chr)
ef7e4ba3 1424{
ef7e4ba3 1425 scm_throw (scm_encoding_error_key,
d050ef66
AW
1426 scm_list_n (scm_from_latin1_string (subr),
1427 scm_from_latin1_string (message),
ef7e4ba3 1428 scm_from_int (err),
6851d3be 1429 port, chr,
ef7e4ba3 1430 SCM_UNDEFINED));
a3d7d5d5
LC
1431}
1432
c62da8f8
LC
1433/* Raise an exception informing of an encoding error on PORT. This
1434 means that a character could not be written in PORT's encoding. */
1435void
1436scm_decoding_error (const char *subr, int err, const char *message, SCM port)
1437{
1438 scm_throw (scm_decoding_error_key,
d050ef66
AW
1439 scm_list_n (scm_from_latin1_string (subr),
1440 scm_from_latin1_string (message),
c62da8f8
LC
1441 scm_from_int (err),
1442 port,
1443 SCM_UNDEFINED));
1444}
1445
1446\f
1447/* String conversion to/from C. */
1448
41d1d984
AW
1449static void
1450decoding_error (const char *func_name, int errno_save,
1451 const char *str, size_t len)
1452{
1453 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1454 handler. */
1455 SCM bv;
1456 signed char *buf;
1457
1458 buf = scm_gc_malloc_pointerless (len, "bytevector");
1459 memcpy (buf, str, len);
1460 bv = scm_c_take_gc_bytevector (buf, len, SCM_BOOL_F);
1461
1462 scm_decoding_error (func_name, errno_save,
1463 "input locale conversion error", bv);
1464}
1465
fac32b51 1466SCM
587a3355
MG
1467scm_from_stringn (const char *str, size_t len, const char *encoding,
1468 scm_t_string_failed_conversion_handler handler)
1469{
1470 size_t u32len, i;
1471 scm_t_wchar *u32;
1472 int wide = 0;
1473 SCM res;
1474
d40e1ca8 1475 /* The order of these checks is important. */
a574564c 1476 if (!str && len != 0)
d40e1ca8
AW
1477 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
1478 if (len == (size_t) -1)
1479 len = strlen (str);
a574564c
AW
1480 if (len == 0)
1481 return scm_nullstr;
fac32b51 1482
889975e5
MG
1483 if (encoding == NULL)
1484 {
1485 /* If encoding is null, use Latin-1. */
1486 char *buf;
190d4b0d 1487 res = scm_i_make_string (len, &buf, 0);
889975e5
MG
1488 memcpy (buf, str, len);
1489 return res;
1490 }
1491
587a3355
MG
1492 u32len = 0;
1493 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1494 (enum iconv_ilseq_handler)
1495 handler,
1496 str, len,
1497 NULL,
1498 NULL, &u32len);
1499
ef7e4ba3 1500 if (SCM_UNLIKELY (u32 == NULL))
41d1d984 1501 decoding_error (__func__, errno, str, len);
587a3355
MG
1502
1503 i = 0;
1504 while (i < u32len)
1505 if (u32[i++] > 0xFF)
1506 {
1507 wide = 1;
1508 break;
1509 }
1510
1511 if (!wide)
1512 {
1513 char *dst;
190d4b0d 1514 res = scm_i_make_string (u32len, &dst, 0);
587a3355
MG
1515 for (i = 0; i < u32len; i ++)
1516 dst[i] = (unsigned char) u32[i];
1517 dst[u32len] = '\0';
1518 }
1519 else
1520 {
1521 scm_t_wchar *wdst;
190d4b0d 1522 res = scm_i_make_wide_string (u32len, &wdst, 0);
587a3355
MG
1523 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1524 wdst[u32len] = 0;
1525 }
1526
1527 free (u32);
1528 return res;
1529}
1530
cf313a94 1531SCM
d40e1ca8 1532scm_from_locale_string (const char *str)
cf313a94 1533{
d40e1ca8 1534 return scm_from_locale_stringn (str, -1);
cf313a94
MG
1535}
1536
c829a427
MV
1537SCM
1538scm_from_locale_stringn (const char *str, size_t len)
1539{
95f5e303
AW
1540 return scm_from_stringn (str, len, locale_charset (),
1541 scm_i_get_conversion_strategy (SCM_BOOL_F));
c829a427 1542}
4d4528e7 1543
c829a427 1544SCM
d40e1ca8 1545scm_from_latin1_string (const char *str)
4d4528e7 1546{
d40e1ca8
AW
1547 return scm_from_latin1_stringn (str, -1);
1548}
9c44cd45 1549
d40e1ca8
AW
1550SCM
1551scm_from_latin1_stringn (const char *str, size_t len)
1552{
e9a35a96
LC
1553 char *buf;
1554 SCM result;
1555
1556 if (len == (size_t) -1)
1557 len = strlen (str);
1558
1559 /* Make a narrow string and copy STR as is. */
190d4b0d 1560 result = scm_i_make_string (len, &buf, 0);
e9a35a96
LC
1561 memcpy (buf, str, len);
1562
1563 return result;
c829a427 1564}
4d4528e7 1565
587a3355 1566SCM
d40e1ca8 1567scm_from_utf8_string (const char *str)
587a3355 1568{
d40e1ca8
AW
1569 return scm_from_utf8_stringn (str, -1);
1570}
1571
1572SCM
1573scm_from_utf8_stringn (const char *str, size_t len)
1574{
41d1d984
AW
1575 size_t i, char_len;
1576 const scm_t_uint8 *ustr = (const scm_t_uint8 *) str;
1577 int ascii = 1, narrow = 1;
1578 SCM res;
1579
1580 if (len == (size_t) -1)
1581 len = strlen (str);
1582
1583 i = 0;
1584 char_len = 0;
1585
1586 while (i < len)
1587 {
1588 if (ustr[i] <= 127)
1589 {
1590 char_len++;
1591 i++;
1592 }
1593 else
1594 {
1595 ucs4_t c;
1596 int nbytes;
1597
1598 ascii = 0;
1599
1600 nbytes = u8_mbtouc (&c, ustr + i, len - i);
1601
1602 if (nbytes < 0)
1603 /* Bad UTF-8. */
1604 decoding_error (__func__, errno, str, len);
1605
1606 if (c > 255)
1607 narrow = 0;
1608
1609 char_len++;
1610 i += nbytes;
1611 }
1612 }
1613
1614 if (ascii)
1615 {
1616 char *dst;
1617 res = scm_i_make_string (char_len, &dst, 0);
1618 memcpy (dst, str, len);
1619 }
1620 else if (narrow)
1621 {
1622 char *dst;
1623 size_t j;
1624 ucs4_t c;
1625
1626 res = scm_i_make_string (char_len, &dst, 0);
1627
1628 for (i = 0, j = 0; i < len; i++, j++)
1629 {
1630 i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1631 dst[j] = (signed char) c;
1632 }
1633 }
1634 else
1635 {
1636 scm_t_wchar *dst;
1637 size_t j;
1638 ucs4_t c;
1639
1640 res = scm_i_make_wide_string (char_len, &dst, 0);
1641
1642 for (i = 0, j = 0; i < len; i++, j++)
1643 {
1644 i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
1645 dst[j] = c;
1646 }
1647 }
1648
1649 return res;
587a3355
MG
1650}
1651
647dc1ac
LC
1652SCM
1653scm_from_utf32_string (const scm_t_wchar *str)
1654{
1655 return scm_from_utf32_stringn (str, -1);
1656}
1657
1658SCM
1659scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
1660{
1661 SCM result;
1662 scm_t_wchar *buf;
1663
1664 if (len == (size_t) -1)
1665 len = u32_strlen ((uint32_t *) str);
1666
190d4b0d 1667 result = scm_i_make_wide_string (len, &buf, 0);
647dc1ac
LC
1668 memcpy (buf, str, len * sizeof (scm_t_wchar));
1669 scm_i_try_narrow_string (result);
1670
1671 return result;
1672}
1673
50b1996f
MG
1674/* Create a new scheme string from the C string STR. The memory of
1675 STR may be used directly as storage for the new string. */
13a94556
LC
1676/* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1677 would be to register a finalizer to eventually free(3) STR, which isn't
1678 worth it. Should we just deprecate the `scm_take_' functions? */
c829a427
MV
1679SCM
1680scm_take_locale_stringn (char *str, size_t len)
1681{
13a94556 1682 SCM res;
48ddf0d9 1683
13a94556
LC
1684 res = scm_from_locale_stringn (str, len);
1685 free (str);
c829a427 1686
c829a427
MV
1687 return res;
1688}
1689
48ddf0d9
KR
1690SCM
1691scm_take_locale_string (char *str)
1692{
1693 return scm_take_locale_stringn (str, -1);
1694}
1695
f1ee6d54
LC
1696/* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1697 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
31d4d02b
LC
1698 Set *LENP to the size of the resulting string.
1699
1700 FIXME: This is a hack we should get rid of. See
1701 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1702 for details. */
1703static void
1704unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
9c44cd45
MG
1705{
1706 char *before, *after;
1707 size_t i, j;
1708
4ff2b9f4
LC
1709 before = buf;
1710 after = buf;
9c44cd45
MG
1711 i = 0;
1712 j = 0;
1713 while (i < *lenp)
1714 {
1715 if ((i <= *lenp - 6)
1716 && before[i] == '\\'
1717 && before[i + 1] == 'u'
1718 && before[i + 2] == '0' && before[i + 3] == '0')
1719 {
1720 /* Convert \u00NN to \xNN */
1721 after[j] = '\\';
1722 after[j + 1] = 'x';
30a6b9ca
MG
1723 after[j + 2] = tolower ((int) before[i + 4]);
1724 after[j + 3] = tolower ((int) before[i + 5]);
9c44cd45
MG
1725 i += 6;
1726 j += 4;
1727 }
1728 else if ((i <= *lenp - 10)
1729 && before[i] == '\\'
1730 && before[i + 1] == 'U'
1731 && before[i + 2] == '0' && before[i + 3] == '0')
1732 {
1733 /* Convert \U00NNNNNN to \UNNNNNN */
1734 after[j] = '\\';
1735 after[j + 1] = 'U';
30a6b9ca
MG
1736 after[j + 2] = tolower ((int) before[i + 4]);
1737 after[j + 3] = tolower ((int) before[i + 5]);
1738 after[j + 4] = tolower ((int) before[i + 6]);
1739 after[j + 5] = tolower ((int) before[i + 7]);
1740 after[j + 6] = tolower ((int) before[i + 8]);
1741 after[j + 7] = tolower ((int) before[i + 9]);
9c44cd45
MG
1742 i += 10;
1743 j += 8;
1744 }
1745 else
1746 {
1747 after[j] = before[i];
1748 i++;
1749 j++;
1750 }
1751 }
1752 *lenp = j;
9c44cd45
MG
1753}
1754
f1ee6d54
LC
1755/* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1756 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1757 of the resulting string. BUF must be large enough to handle the
1758 worst case when `\uXXXX' escapes (6 characters) are replaced by
1759 `\xXXXX;' (7 characters). */
31d4d02b
LC
1760static void
1761unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
d31b9519
MG
1762{
1763 char *before, *after;
1764 size_t i, j;
1765 /* The worst case is if the input string contains all 4-digit hex escapes.
1766 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1767 size_t max_out_len = (*lenp * 7) / 6 + 1;
1768 size_t nzeros, ndigits;
1769
4ff2b9f4 1770 before = buf;
d31b9519
MG
1771 after = alloca (max_out_len);
1772 i = 0;
1773 j = 0;
1774 while (i < *lenp)
1775 {
1776 if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1777 || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1778 {
1779 if (before[i + 1] == 'u')
1780 ndigits = 4;
1781 else if (before[i + 1] == 'U')
1782 ndigits = 8;
1783 else
1784 abort ();
1785
1786 /* Add the R6RS hex escape initial sequence. */
1787 after[j] = '\\';
1788 after[j + 1] = 'x';
1789
1790 /* Move string positions to the start of the hex numbers. */
1791 i += 2;
1792 j += 2;
1793
1794 /* Find the number of initial zeros in this hex number. */
1795 nzeros = 0;
1796 while (before[i + nzeros] == '0' && nzeros < ndigits)
1797 nzeros++;
1798
1799 /* Copy the number, skipping initial zeros, and then move the string
1800 positions. */
1801 if (nzeros == ndigits)
1802 {
1803 after[j] = '0';
1804 i += ndigits;
1805 j += 1;
1806 }
1807 else
1808 {
1809 int pos;
1810 for (pos = 0; pos < ndigits - nzeros; pos++)
1811 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1812 i += ndigits;
1813 j += (ndigits - nzeros);
1814 }
1815
1816 /* Add terminating semicolon. */
1817 after[j] = ';';
1818 j++;
1819 }
1820 else
1821 {
1822 after[j] = before[i];
1823 i++;
1824 j++;
1825 }
1826 }
1827 *lenp = j;
d31b9519
MG
1828 memcpy (before, after, j);
1829}
1830
cf313a94 1831char *
d40e1ca8 1832scm_to_locale_string (SCM str)
cf313a94 1833{
d40e1ca8 1834 return scm_to_locale_stringn (str, NULL);
cf313a94 1835}
d31b9519 1836
c829a427 1837char *
fac32b51 1838scm_to_locale_stringn (SCM str, size_t *lenp)
c829a427 1839{
889975e5 1840 return scm_to_stringn (str, lenp,
95f5e303 1841 locale_charset (),
889975e5 1842 scm_i_get_conversion_strategy (SCM_BOOL_F));
9c44cd45
MG
1843}
1844
d40e1ca8
AW
1845char *
1846scm_to_latin1_string (SCM str)
1847{
1848 return scm_to_latin1_stringn (str, NULL);
1849}
1850
1851char *
1852scm_to_latin1_stringn (SCM str, size_t *lenp)
e9a35a96 1853#define FUNC_NAME "scm_to_latin1_stringn"
d40e1ca8 1854{
e9a35a96
LC
1855 char *result;
1856
1857 SCM_VALIDATE_STRING (1, str);
1858
1859 if (scm_i_is_narrow_string (str))
1860 {
fe133640
AW
1861 size_t len = scm_i_string_length (str);
1862
e9a35a96 1863 if (lenp)
fe133640 1864 *lenp = len;
e9a35a96 1865
fe133640 1866 result = scm_strndup (scm_i_string_data (str), len);
e9a35a96
LC
1867 }
1868 else
1869 result = scm_to_stringn (str, lenp, NULL,
fe133640 1870 SCM_FAILED_CONVERSION_ERROR);
e9a35a96
LC
1871
1872 return result;
d40e1ca8 1873}
e9a35a96 1874#undef FUNC_NAME
d40e1ca8
AW
1875
1876char *
1877scm_to_utf8_string (SCM str)
1878{
1879 return scm_to_utf8_stringn (str, NULL);
1880}
1881
1882char *
1883scm_to_utf8_stringn (SCM str, size_t *lenp)
1884{
1885 return scm_to_stringn (str, lenp, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1886}
1887
647dc1ac
LC
1888scm_t_wchar *
1889scm_to_utf32_string (SCM str)
1890{
1891 return scm_to_utf32_stringn (str, NULL);
1892}
1893
1894scm_t_wchar *
1895scm_to_utf32_stringn (SCM str, size_t *lenp)
1896#define FUNC_NAME "scm_to_utf32_stringn"
1897{
1898 scm_t_wchar *result;
1899
1900 SCM_VALIDATE_STRING (1, str);
1901
1902 if (scm_i_is_narrow_string (str))
1903 result = (scm_t_wchar *)
1904 scm_to_stringn (str, lenp, "UTF-32",
1905 SCM_FAILED_CONVERSION_ERROR);
1906 else
1907 {
1908 size_t len;
1909
1910 len = scm_i_string_length (str);
1911 if (lenp)
1912 *lenp = len;
1913
1914 result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
1915 memcpy (result, scm_i_string_wide_chars (str),
1916 len * sizeof (scm_t_wchar));
1917 result[len] = 0;
1918 }
1919
1920 return result;
1921}
1922#undef FUNC_NAME
1923
29bcdbb0
LC
1924/* Return a malloc(3)-allocated buffer containing the contents of STR encoded
1925 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
1926 the returned buffer. If the conversion to ENCODING fails, apply the strategy
1927 defined by HANDLER. */
9c44cd45 1928char *
587a3355 1929scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
eca29b02 1930 scm_t_string_failed_conversion_handler handler)
9c44cd45 1931{
9c44cd45
MG
1932 char *buf;
1933 size_t ilen, len, i;
889975e5
MG
1934 int ret;
1935 const char *enc;
4d4528e7 1936
3ee86942 1937 if (!scm_is_string (str))
c829a427 1938 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1939 ilen = scm_i_string_length (str);
1940
1941 if (ilen == 0)
1942 {
1943 buf = scm_malloc (1);
1944 buf[0] = '\0';
1945 if (lenp)
1946 *lenp = 0;
1947 return buf;
1948 }
587a3355 1949
c829a427 1950 if (lenp == NULL)
9c44cd45
MG
1951 for (i = 0; i < ilen; i++)
1952 if (scm_i_string_ref (str, i) == '\0')
1953 scm_misc_error (NULL,
1954 "string contains #\\nul character: ~S",
1955 scm_list_1 (str));
1956
889975e5 1957 if (scm_i_is_narrow_string (str) && (encoding == NULL))
c829a427 1958 {
889975e5
MG
1959 /* If using native Latin-1 encoding, just copy the string
1960 contents. */
9c44cd45
MG
1961 if (lenp)
1962 {
1963 buf = scm_malloc (ilen);
1964 memcpy (buf, scm_i_string_chars (str), ilen);
1965 *lenp = ilen;
1966 return buf;
1967 }
1968 else
1969 {
1970 buf = scm_malloc (ilen + 1);
1971 memcpy (buf, scm_i_string_chars (str), ilen);
1972 buf[ilen] = '\0';
1973 return buf;
1974 }
c829a427 1975 }
9c44cd45 1976
587a3355 1977
9c44cd45
MG
1978 buf = NULL;
1979 len = 0;
889975e5
MG
1980 enc = encoding;
1981 if (enc == NULL)
1982 enc = "ISO-8859-1";
1983 if (scm_i_is_narrow_string (str))
1984 {
1985 ret = mem_iconveh (scm_i_string_chars (str), ilen,
1986 "ISO-8859-1", enc,
1987 (enum iconv_ilseq_handler) handler, NULL,
1988 &buf, &len);
9c44cd45 1989
889975e5 1990 if (ret != 0)
ef7e4ba3 1991 scm_encoding_error (__func__, errno,
6851d3be
LC
1992 "cannot convert narrow string to output locale",
1993 SCM_BOOL_F,
1994 /* FIXME: Faulty character unknown. */
1995 SCM_BOOL_F);
889975e5
MG
1996 }
1997 else
1998 {
d31b9519 1999 buf = u32_conv_to_encoding (enc,
889975e5 2000 (enum iconv_ilseq_handler) handler,
d31b9519 2001 (scm_t_uint32 *) scm_i_string_wide_chars (str),
889975e5
MG
2002 ilen,
2003 NULL,
2004 NULL, &len);
2005 if (buf == NULL)
ef7e4ba3 2006 scm_encoding_error (__func__, errno,
6851d3be
LC
2007 "cannot convert wide string to output locale",
2008 SCM_BOOL_F,
2009 /* FIXME: Faulty character unknown. */
2010 SCM_BOOL_F);
d31b9519
MG
2011 }
2012 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
2013 {
2014 if (SCM_R6RS_ESCAPES_P)
f1ee6d54
LC
2015 {
2016 /* The worst case is if the input string contains all 4-digit
2017 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
2018 (seven characters). Make BUF large enough to hold
2019 that. */
2020 buf = scm_realloc (buf, (len * 7) / 6 + 1);
31d4d02b 2021 unistring_escapes_to_r6rs_escapes (buf, &len);
f1ee6d54 2022 }
d31b9519 2023 else
31d4d02b 2024 unistring_escapes_to_guile_escapes (buf, &len);
4ff2b9f4
LC
2025
2026 buf = scm_realloc (buf, len);
889975e5 2027 }
9c44cd45 2028 if (lenp)
4d4528e7 2029 *lenp = len;
9c44cd45
MG
2030 else
2031 {
2032 buf = scm_realloc (buf, len + 1);
2033 buf[len] = '\0';
2034 }
24933780 2035
c829a427 2036 scm_remember_upto_here_1 (str);
9c44cd45 2037 return buf;
4d4528e7 2038}
af68e5e5 2039
c829a427
MV
2040size_t
2041scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
2042{
2043 size_t len;
9c44cd45 2044 char *result = NULL;
3ee86942 2045 if (!scm_is_string (str))
c829a427 2046 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
2047 result = scm_to_locale_stringn (str, &len);
2048
2049 memcpy (buf, result, (len > max_len) ? max_len : len);
2050 free (result);
2051
c829a427
MV
2052 scm_remember_upto_here_1 (str);
2053 return len;
2054}
af68e5e5 2055
a3d7d5d5
LC
2056\f
2057/* Unicode string normalization. */
2058
edb7bb47
JG
2059/* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2060 libguile/i18n.c. It would be useful to have this factored out into a more
2061 convenient location, but its use of alloca makes that tricky to do. */
2062
2063static SCM
2064normalize_str (SCM string, uninorm_t form)
2065{
2066 SCM ret;
2067 scm_t_uint32 *w_str;
2068 scm_t_wchar *cbuf;
2069 size_t rlen, len = scm_i_string_length (string);
2070
2071 if (scm_i_is_narrow_string (string))
2072 {
2073 size_t i;
2074 const char *buf = scm_i_string_chars (string);
2075
2076 w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
2077
2078 for (i = 0; i < len; i ++)
2079 w_str[i] = (unsigned char) buf[i];
2080 w_str[len] = 0;
2081 }
d8164b04
JG
2082 else
2083 w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
2084
edb7bb47
JG
2085 w_str = u32_normalize (form, w_str, len, NULL, &rlen);
2086
190d4b0d 2087 ret = scm_i_make_wide_string (rlen, &cbuf, 0);
edb7bb47
JG
2088 u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
2089 free (w_str);
d8164b04
JG
2090
2091 scm_i_try_narrow_string (ret);
2092
edb7bb47
JG
2093 return ret;
2094}
2095
2096SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
2097 (SCM string),
2098 "Returns the NFC normalized form of @var{string}.")
2099#define FUNC_NAME s_scm_string_normalize_nfc
2100{
2101 SCM_VALIDATE_STRING (1, string);
2102 return normalize_str (string, UNINORM_NFC);
2103}
2104#undef FUNC_NAME
2105
2106SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
2107 (SCM string),
2108 "Returns the NFD normalized form of @var{string}.")
2109#define FUNC_NAME s_scm_string_normalize_nfd
2110{
2111 SCM_VALIDATE_STRING (1, string);
2112 return normalize_str (string, UNINORM_NFD);
2113}
2114#undef FUNC_NAME
2115
2116SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
2117 (SCM string),
2118 "Returns the NFKC normalized form of @var{string}.")
2119#define FUNC_NAME s_scm_string_normalize_nfkc
2120{
2121 SCM_VALIDATE_STRING (1, string);
2122 return normalize_str (string, UNINORM_NFKC);
2123}
2124#undef FUNC_NAME
2125
2126SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
2127 (SCM string),
2128 "Returns the NFKD normalized form of @var{string}.")
2129#define FUNC_NAME s_scm_string_normalize_nfkd
2130{
2131 SCM_VALIDATE_STRING (1, string);
2132 return normalize_str (string, UNINORM_NFKD);
2133}
2134#undef FUNC_NAME
2135
7505c6e0
MW
2136/* converts C scm_array of strings to SCM scm_list of strings.
2137 If argc < 0, a null terminated scm_array is assumed.
2138 The current locale encoding is assumed */
9c44cd45 2139SCM
3ee86942
MV
2140scm_makfromstrs (int argc, char **argv)
2141{
2142 int i = argc;
2143 SCM lst = SCM_EOL;
2144 if (0 > i)
2145 for (i = 0; argv[i]; i++);
2146 while (i--)
2147 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
2148 return lst;
2149}
2150
c829a427 2151/* Return a newly allocated array of char pointers to each of the strings
7505c6e0
MW
2152 in args, with a terminating NULL pointer. The strings are encoded using
2153 the current locale. */
c829a427
MV
2154
2155char **
2156scm_i_allocate_string_pointers (SCM list)
2a776823 2157#define FUNC_NAME "scm_i_allocate_string_pointers"
af68e5e5 2158{
c829a427 2159 char **result;
7505c6e0 2160 int list_len = scm_ilength (list);
c829a427
MV
2161 int i;
2162
7505c6e0 2163 if (list_len < 0)
c829a427
MV
2164 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
2165
7505c6e0 2166 result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
2a776823 2167 "string pointers");
7505c6e0 2168 result[list_len] = NULL;
c829a427 2169
7505c6e0 2170 /* The list might have been modified in another thread, so
c829a427
MV
2171 we check LIST before each access.
2172 */
7505c6e0 2173 for (i = 0; i < list_len && scm_is_pair (list); i++)
c829a427 2174 {
7505c6e0
MW
2175 SCM str = SCM_CAR (list);
2176 size_t len; /* String length in bytes */
2177 char *c_str = scm_to_locale_stringn (str, &len);
2178
2179 /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2180 scm_malloc to allocate the returned string, which must be
2181 explicitly deallocated. This forces us to copy the string a
2182 second time into a new buffer. Ideally there would be variants
2183 of scm_to_*_stringn that can return garbage-collected buffers. */
2184
2185 result[i] = scm_gc_malloc_pointerless (len + 1, "string");
2186 memcpy (result[i], c_str, len);
2a776823 2187 result[i][len] = '\0';
7505c6e0 2188 free (c_str);
2a776823 2189
c829a427
MV
2190 list = SCM_CDR (list);
2191 }
2192
c829a427 2193 return result;
af68e5e5 2194}
2a776823 2195#undef FUNC_NAME
24933780 2196
6f14f578
MV
2197void
2198scm_i_get_substring_spec (size_t len,
2199 SCM start, size_t *cstart,
2200 SCM end, size_t *cend)
2201{
2202 if (SCM_UNBNDP (start))
2203 *cstart = 0;
2204 else
2205 *cstart = scm_to_unsigned_integer (start, 0, len);
2206
2207 if (SCM_UNBNDP (end))
2208 *cend = len;
2209 else
2210 *cend = scm_to_unsigned_integer (end, *cstart, len);
2211}
2212
2a610be5
AW
2213static SCM
2214string_handle_ref (scm_t_array_handle *h, size_t index)
2215{
2216 return scm_c_string_ref (h->array, index);
2217}
2218
2219static void
2220string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2221{
2222 scm_c_string_set_x (h->array, index, val);
2223}
2224
2225static void
2226string_get_handle (SCM v, scm_t_array_handle *h)
2227{
2228 h->array = v;
2229 h->ndims = 1;
2230 h->dims = &h->dim0;
2231 h->dim0.lbnd = 0;
2232 h->dim0.ubnd = scm_c_string_length (v) - 1;
2233 h->dim0.inc = 1;
2234 h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2235 h->elements = h->writable_elements = NULL;
2236}
2237
c5f17102 2238SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2a610be5 2239 string_handle_ref, string_handle_set,
f65e0168
LC
2240 string_get_handle)
2241SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2a610be5 2242
0f2d19dd
JB
2243void
2244scm_init_strings ()
0f2d19dd 2245{
190d4b0d 2246 scm_nullstr = scm_i_make_string (0, NULL, 1);
7c33806a 2247
a0599745 2248#include "libguile/strings.x"
0f2d19dd
JB
2249}
2250
89e00824
ML
2251
2252/*
2253 Local Variables:
2254 c-file-style: "gnu"
2255 End:
2256*/