Optimize `peek-char'.
[bpt/guile.git] / libguile / strings.c
CommitLineData
29bcdbb0 1/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010 Free Software Foundation, Inc.
0f2d19dd 2 *
73be1d9e 3 * This library is free software; you can redistribute it and/or
53befeb7
NJ
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
0f2d19dd 7 *
53befeb7
NJ
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
73be1d9e
MV
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
0f2d19dd 12 *
73be1d9e
MV
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
53befeb7
NJ
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
73be1d9e 17 */
1bbd0b84 18
1bbd0b84 19
0f2d19dd 20\f
dbb605f5
LC
21#ifdef HAVE_CONFIG_H
22# include <config.h>
23#endif
0f2d19dd 24
faf2c9d7 25#include <string.h>
3ee86942 26#include <stdio.h>
9c44cd45 27#include <ctype.h>
edb7bb47 28#include <uninorm.h>
9c44cd45 29#include <unistr.h>
eca29b02 30#include <uniconv.h>
faf2c9d7 31
889975e5
MG
32#include "striconveh.h"
33
a0599745
MD
34#include "libguile/_scm.h"
35#include "libguile/chars.h"
7c33806a 36#include "libguile/root.h"
a0599745 37#include "libguile/strings.h"
a3d7d5d5 38#include "libguile/error.h"
f45eccff 39#include "libguile/generalized-vectors.h"
1afff620 40#include "libguile/deprecation.h"
a0599745 41#include "libguile/validate.h"
d31b9519 42#include "libguile/private-options.h"
1afff620 43
0f2d19dd
JB
44\f
45
46/* {Strings}
47 */
48
3ee86942
MV
49
50/* Stringbufs
51 *
52 * XXX - keeping an accurate refcount during GC seems to be quite
53 * tricky, so we just keep score of whether a stringbuf might be
50b1996f 54 * shared, not whether it definitely is.
3ee86942
MV
55 *
56 * The scheme I (mvo) tried to keep an accurate reference count would
57 * recount all strings that point to a stringbuf during the mark-phase
58 * of the GC. This was done since one cannot access the stringbuf of
59 * a string when that string is freed (in order to decrease the
60 * reference count). The memory of the stringbuf might have been
61 * reused already for something completely different.
62 *
63 * This recounted worked for a small number of threads beating on
64 * cow-strings, but it failed randomly with more than 10 threads, say.
65 * I couldn't figure out what went wrong, so I used the conservative
66 * approach implemented below.
50b1996f 67 *
ba54a202
LC
68 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
69 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
70 * strings.
3ee86942
MV
71 */
72
ba54a202
LC
73/* The size in words of the stringbuf header (type tag + size). */
74#define STRINGBUF_HEADER_SIZE 2U
75
76#define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
77
35920c00 78#define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
5f236208 79#define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
3ee86942
MV
80
81#define STRINGBUF_TAG scm_tc7_stringbuf
82#define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
9c44cd45 83#define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
3ee86942 84
100e20c7 85#define STRINGBUF_CONTENTS(buf) ((void *) \
ba54a202
LC
86 SCM_CELL_OBJECT_LOC (buf, \
87 STRINGBUF_HEADER_SIZE))
100e20c7
LC
88#define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
89#define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
3ee86942 90
100e20c7 91#define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
3ee86942 92
9b41542f
LC
93#define SET_STRINGBUF_SHARED(buf) \
94 do \
95 { \
96 /* Don't modify BUF if it's already marked as shared since it might be \
97 a read-only, statically allocated stringbuf. */ \
98 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
99 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
100 } \
101 while (0)
3ee86942 102
56a3dcd4 103#ifdef SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
104static size_t lenhist[1001];
105#endif
106
50b1996f
MG
107/* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
108 characters. */
3ee86942
MV
109static SCM
110make_stringbuf (size_t len)
0f2d19dd 111{
3ee86942
MV
112 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
113 scm_i_symbol_chars, all stringbufs are null-terminated. Once
114 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
115 has been changed for scm_i_symbol_chars, this null-termination
116 can be dropped.
117 */
118
ba54a202
LC
119 SCM buf;
120
56a3dcd4 121#ifdef SCM_STRING_LENGTH_HISTOGRAM
3ee86942
MV
122 if (len < 1000)
123 lenhist[len]++;
124 else
125 lenhist[1000]++;
126#endif
0f2d19dd 127
ba54a202
LC
128 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
129 "string"));
130
131 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
132 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
133
134 STRINGBUF_CHARS (buf)[len] = 0;
135
136 return buf;
3ee86942 137}
e53cc817 138
50b1996f
MG
139/* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
140 characters. */
9c44cd45
MG
141static SCM
142make_wide_stringbuf (size_t len)
143{
ba54a202
LC
144 SCM buf;
145 size_t raw_len;
146
56a3dcd4 147#ifdef SCM_STRING_LENGTH_HISTOGRAM
9c44cd45
MG
148 if (len < 1000)
149 lenhist[len]++;
150 else
151 lenhist[1000]++;
152#endif
153
ba54a202
LC
154 raw_len = (len + 1) * sizeof (scm_t_wchar);
155 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
156 "string"));
157
158 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
159 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
160
161 STRINGBUF_WIDE_CHARS (buf)[len] = 0;
162
163 return buf;
9c44cd45
MG
164}
165
ba54a202
LC
166/* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
167 characters from BUF. */
168static SCM
169wide_stringbuf (SCM buf)
9c44cd45 170{
ba54a202 171 SCM new_buf;
9c44cd45
MG
172
173 if (STRINGBUF_WIDE (buf))
ba54a202
LC
174 new_buf = buf;
175 else
9c44cd45 176 {
ba54a202
LC
177 size_t i, len;
178 scm_t_wchar *mem;
9c44cd45 179
ba54a202 180 len = STRINGBUF_LENGTH (buf);
9c44cd45 181
ba54a202 182 new_buf = make_wide_stringbuf (len);
9c44cd45 183
ba54a202 184 mem = STRINGBUF_WIDE_CHARS (new_buf);
9c44cd45 185 for (i = 0; i < len; i++)
ba54a202 186 mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
9c44cd45 187 mem[len] = 0;
9c44cd45 188 }
ba54a202
LC
189
190 return new_buf;
3ee86942 191}
bd9e24b3 192
ba54a202
LC
193/* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
194 characters from BUF, if possible. */
195static SCM
587a3355
MG
196narrow_stringbuf (SCM buf)
197{
ba54a202 198 SCM new_buf;
587a3355
MG
199
200 if (!STRINGBUF_WIDE (buf))
ba54a202
LC
201 new_buf = buf;
202 else
203 {
204 size_t i, len;
205 scm_t_wchar *wmem;
206 unsigned char *mem;
587a3355 207
ba54a202
LC
208 len = STRINGBUF_LENGTH (buf);
209 wmem = STRINGBUF_WIDE_CHARS (buf);
587a3355 210
ba54a202
LC
211 for (i = 0; i < len; i++)
212 if (wmem[i] > 0xFF)
213 /* BUF cannot be narrowed. */
214 return buf;
587a3355 215
ba54a202 216 new_buf = make_stringbuf (len);
587a3355 217
ba54a202
LC
218 mem = STRINGBUF_CHARS (new_buf);
219 for (i = 0; i < len; i++)
220 mem[i] = (unsigned char) wmem[i];
221 mem[len] = 0;
222 }
223
224 return new_buf;
587a3355
MG
225}
226
9de87eea 227scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
bd9e24b3 228
ba54a202 229\f
3ee86942
MV
230/* Copy-on-write strings.
231 */
bd9e24b3 232
3ee86942 233#define STRING_TAG scm_tc7_string
bd9e24b3 234
3ee86942
MV
235#define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
236#define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
237#define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
bd9e24b3 238
3ee86942
MV
239#define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
240#define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
241
242#define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
243
ed35de72
MV
244/* Read-only strings.
245 */
246
35920c00 247#define RO_STRING_TAG scm_tc7_ro_string
ed35de72
MV
248#define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
249
e1b29f6a
MV
250/* Mutation-sharing substrings
251 */
252
253#define SH_STRING_TAG (scm_tc7_string + 0x100)
254
255#define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
256/* START and LENGTH as for STRINGs. */
257
258#define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
259
e7efe8e7
AW
260SCM scm_nullstr;
261
50b1996f
MG
262/* Create a scheme string with space for LEN 8-bit Latin-1-encoded
263 characters. CHARSP, if not NULL, will be set to location of the
264 char array. */
3ee86942
MV
265SCM
266scm_i_make_string (size_t len, char **charsp)
267{
268 SCM buf = make_stringbuf (len);
269 SCM res;
270 if (charsp)
f59cf998 271 *charsp = (char *) STRINGBUF_CHARS (buf);
3ee86942
MV
272 res = scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
273 (scm_t_bits)0, (scm_t_bits) len);
274 return res;
0f2d19dd
JB
275}
276
50b1996f
MG
277/* Create a scheme string with space for LEN 32-bit UCS-4-encoded
278 characters. CHARSP, if not NULL, will be set to location of the
279 character array. */
9c44cd45 280SCM
50b1996f 281scm_i_make_wide_string (size_t len, scm_t_wchar **charsp)
9c44cd45
MG
282{
283 SCM buf = make_wide_stringbuf (len);
284 SCM res;
285 if (charsp)
286 *charsp = STRINGBUF_WIDE_CHARS (buf);
287 res = scm_double_cell (STRING_TAG, SCM_UNPACK (buf),
288 (scm_t_bits) 0, (scm_t_bits) len);
289 return res;
290}
291
3ee86942
MV
292static void
293validate_substring_args (SCM str, size_t start, size_t end)
294{
295 if (!IS_STRING (str))
296 scm_wrong_type_arg_msg (NULL, 0, str, "string");
297 if (start > STRING_LENGTH (str))
298 scm_out_of_range (NULL, scm_from_size_t (start));
299 if (end > STRING_LENGTH (str) || end < start)
300 scm_out_of_range (NULL, scm_from_size_t (end));
301}
0f2d19dd 302
e1b29f6a
MV
303static inline void
304get_str_buf_start (SCM *str, SCM *buf, size_t *start)
305{
306 *start = STRING_START (*str);
307 if (IS_SH_STRING (*str))
308 {
309 *str = SH_STRING_STRING (*str);
310 *start += STRING_START (*str);
311 }
312 *buf = STRING_STRINGBUF (*str);
313}
314
3ee86942
MV
315SCM
316scm_i_substring (SCM str, size_t start, size_t end)
0f2d19dd 317{
e1b29f6a
MV
318 SCM buf;
319 size_t str_start;
320 get_str_buf_start (&str, &buf, &str_start);
9de87eea 321 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 322 SET_STRINGBUF_SHARED (buf);
9de87eea 323 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 324 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
e1b29f6a
MV
325 (scm_t_bits)str_start + start,
326 (scm_t_bits) end - start);
0f2d19dd
JB
327}
328
ed35de72
MV
329SCM
330scm_i_substring_read_only (SCM str, size_t start, size_t end)
331{
45a9f430
LC
332 SCM buf;
333 size_t str_start;
334 get_str_buf_start (&str, &buf, &str_start);
335 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
336 SET_STRINGBUF_SHARED (buf);
337 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
338 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
339 (scm_t_bits)str_start + start,
340 (scm_t_bits) end - start);
ed35de72
MV
341}
342
3ee86942
MV
343SCM
344scm_i_substring_copy (SCM str, size_t start, size_t end)
345{
346 size_t len = end - start;
edea856c 347 SCM buf, my_buf;
e1b29f6a
MV
348 size_t str_start;
349 get_str_buf_start (&str, &buf, &str_start);
9c44cd45
MG
350 if (scm_i_is_narrow_string (str))
351 {
352 my_buf = make_stringbuf (len);
353 memcpy (STRINGBUF_CHARS (my_buf),
354 STRINGBUF_CHARS (buf) + str_start + start, len);
355 }
356 else
357 {
358 my_buf = make_wide_stringbuf (len);
359 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
360 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
361 + start), len);
362 /* Even though this string is wide, the substring may be narrow.
50b1996f 363 Consider adding code to narrow the string. */
9c44cd45 364 }
3ee86942 365 scm_remember_upto_here_1 (buf);
9c44cd45
MG
366 return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
367 (scm_t_bits) 0, (scm_t_bits) len);
3ee86942 368}
0f2d19dd 369
e1b29f6a
MV
370SCM
371scm_i_substring_shared (SCM str, size_t start, size_t end)
372{
373 if (start == 0 && end == STRING_LENGTH (str))
374 return str;
375 else
376 {
377 size_t len = end - start;
378 if (IS_SH_STRING (str))
379 {
380 start += STRING_START (str);
381 str = SH_STRING_STRING (str);
382 }
383 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
384 (scm_t_bits)start, (scm_t_bits) len);
385 }
386}
387
3ee86942
MV
388SCM
389scm_c_substring (SCM str, size_t start, size_t end)
390{
391 validate_substring_args (str, start, end);
392 return scm_i_substring (str, start, end);
393}
ee149d03 394
ed35de72
MV
395SCM
396scm_c_substring_read_only (SCM str, size_t start, size_t end)
397{
398 validate_substring_args (str, start, end);
399 return scm_i_substring_read_only (str, start, end);
400}
401
0f2d19dd 402SCM
3ee86942 403scm_c_substring_copy (SCM str, size_t start, size_t end)
0f2d19dd 404{
3ee86942
MV
405 validate_substring_args (str, start, end);
406 return scm_i_substring_copy (str, start, end);
407}
408
3ee86942
MV
409SCM
410scm_c_substring_shared (SCM str, size_t start, size_t end)
411{
412 validate_substring_args (str, start, end);
413 return scm_i_substring_shared (str, start, end);
414}
0f2d19dd 415
d6c74168 416\f
3ee86942
MV
417/* Internal accessors
418 */
419
50b1996f
MG
420/* Returns the number of characters in STR. This may be different
421 than the memory size of the string storage. */
3ee86942
MV
422size_t
423scm_i_string_length (SCM str)
0f2d19dd 424{
3ee86942 425 return STRING_LENGTH (str);
0f2d19dd
JB
426}
427
50b1996f
MG
428/* True if the string is 'narrow', meaning it has a 8-bit Latin-1
429 encoding. False if it is 'wide', having a 32-bit UCS-4
430 encoding. */
9c44cd45
MG
431int
432scm_i_is_narrow_string (SCM str)
433{
434 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
435}
436
587a3355
MG
437/* Try to coerce a string to be narrow. It if is narrow already, do
438 nothing. If it is wide, shrink it to narrow if none of its
439 characters are above 0xFF. Return true if the string is narrow or
440 was made to be narrow. */
441int
442scm_i_try_narrow_string (SCM str)
443{
ba54a202 444 SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
587a3355
MG
445
446 return scm_i_is_narrow_string (str);
447}
448
100e20c7
LC
449/* Return a pointer to the raw data of the string, which can be either Latin-1
450 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
451const void *
452scm_i_string_data (SCM str)
453{
454 SCM buf;
455 size_t start;
456 const char *data;
457
458 get_str_buf_start (&str, &buf, &start);
459
460 data = STRINGBUF_CONTENTS (buf);
461 data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
462
463 return data;
464}
465
50b1996f
MG
466/* Returns a pointer to the 8-bit Latin-1 encoded character array of
467 STR. */
3ee86942
MV
468const char *
469scm_i_string_chars (SCM str)
470{
471 SCM buf;
e1b29f6a
MV
472 size_t start;
473 get_str_buf_start (&str, &buf, &start);
9c44cd45 474 if (scm_i_is_narrow_string (str))
f59cf998 475 return (const char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
476 else
477 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
478 scm_list_1 (str));
479 return NULL;
3ee86942 480}
b00418df 481
50b1996f
MG
482/* Returns a pointer to the 32-bit UCS-4 encoded character array of
483 STR. */
9c44cd45
MG
484const scm_t_wchar *
485scm_i_string_wide_chars (SCM str)
486{
487 SCM buf;
488 size_t start;
489
490 get_str_buf_start (&str, &buf, &start);
491 if (!scm_i_is_narrow_string (str))
f59cf998 492 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
9c44cd45
MG
493 else
494 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
495 scm_list_1 (str));
496}
497
498/* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
499 a new string buffer, so that it can be modified without modifying
50b1996f
MG
500 other strings. Also, lock the string mutex. Later, one must call
501 scm_i_string_stop_writing to unlock the mutex. */
9c44cd45
MG
502SCM
503scm_i_string_start_writing (SCM orig_str)
b00418df 504{
ed35de72 505 SCM buf, str = orig_str;
e1b29f6a 506 size_t start;
ed35de72 507
e1b29f6a 508 get_str_buf_start (&str, &buf, &start);
ed35de72
MV
509 if (IS_RO_STRING (str))
510 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
511
9de87eea 512 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942
MV
513 if (STRINGBUF_SHARED (buf))
514 {
9c44cd45 515 /* Clone the stringbuf. */
3ee86942
MV
516 size_t len = STRING_LENGTH (str);
517 SCM new_buf;
518
9de87eea 519 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 520
9c44cd45
MG
521 if (scm_i_is_narrow_string (str))
522 {
523 new_buf = make_stringbuf (len);
524 memcpy (STRINGBUF_CHARS (new_buf),
525 STRINGBUF_CHARS (buf) + STRING_START (str), len);
526
527 }
528 else
529 {
530 new_buf = make_wide_stringbuf (len);
531 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
532 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
533 + STRING_START (str)), len);
534 }
3ee86942 535
3ee86942
MV
536 SET_STRING_STRINGBUF (str, new_buf);
537 start -= STRING_START (str);
902578f1
LC
538
539 /* FIXME: The following operations are not atomic, so other threads
540 looking at STR may see an inconsistent state. Nevertheless it can't
541 hurt much since (i) accessing STR while it is being mutated can't
542 yield a crash, and (ii) concurrent accesses to STR should be
543 protected by a mutex at the application level. The latter may not
544 apply when STR != ORIG_STR, though. */
3ee86942 545 SET_STRING_START (str, 0);
902578f1 546 SET_STRING_STRINGBUF (str, new_buf);
3ee86942
MV
547
548 buf = new_buf;
549
9de87eea 550 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 551 }
9c44cd45
MG
552 return orig_str;
553}
3ee86942 554
50b1996f 555/* Return a pointer to the 8-bit Latin-1 chars of a string. */
9c44cd45
MG
556char *
557scm_i_string_writable_chars (SCM str)
558{
559 SCM buf;
560 size_t start;
3ee86942 561
9c44cd45
MG
562 get_str_buf_start (&str, &buf, &start);
563 if (scm_i_is_narrow_string (str))
f59cf998 564 return (char *) STRINGBUF_CHARS (buf) + start;
9c44cd45
MG
565 else
566 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
567 scm_list_1 (str));
568 return NULL;
b00418df
DH
569}
570
50b1996f 571/* Return a pointer to the UCS-4 codepoints of a string. */
9c44cd45
MG
572static scm_t_wchar *
573scm_i_string_writable_wide_chars (SCM str)
574{
575 SCM buf;
576 size_t start;
577
578 get_str_buf_start (&str, &buf, &start);
579 if (!scm_i_is_narrow_string (str))
580 return STRINGBUF_WIDE_CHARS (buf) + start;
581 else
1c7b216f 582 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
9c44cd45 583 scm_list_1 (str));
b00418df
DH
584}
585
50b1996f
MG
586/* Unlock the string mutex that was locked when
587 scm_i_string_start_writing was called. */
3ee86942
MV
588void
589scm_i_string_stop_writing (void)
590{
9de87eea 591 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942 592}
b00418df 593
50b1996f 594/* Return the Xth character of STR as a UCS-4 codepoint. */
9c44cd45
MG
595scm_t_wchar
596scm_i_string_ref (SCM str, size_t x)
597{
598 if (scm_i_is_narrow_string (str))
599 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
600 else
601 return scm_i_string_wide_chars (str)[x];
602}
603
889975e5
MG
604/* Returns index+1 of the first char in STR that matches C, or
605 0 if the char is not found. */
606int
607scm_i_string_contains_char (SCM str, char ch)
608{
609 size_t i;
610 size_t len = scm_i_string_length (str);
611
612 i = 0;
613 if (scm_i_is_narrow_string (str))
614 {
615 while (i < len)
616 {
617 if (scm_i_string_chars (str)[i] == ch)
618 return i+1;
619 i++;
620 }
621 }
622 else
623 {
624 while (i < len)
625 {
626 if (scm_i_string_wide_chars (str)[i]
627 == (unsigned char) ch)
628 return i+1;
629 i++;
630 }
631 }
632 return 0;
633}
634
3f47e526
MG
635int
636scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
637{
638 if (scm_i_is_narrow_string (sstr))
639 {
640 const char *a = scm_i_string_chars (sstr) + start_x;
641 const char *b = cstr;
642 return strncmp (a, b, strlen(b));
643 }
644 else
645 {
646 size_t i;
647 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
648 const char *b = cstr;
649 for (i = 0; i < strlen (b); i++)
650 {
651 if (a[i] != (unsigned char) b[i])
652 return 1;
653 }
654 }
655 return 0;
656}
657
50b1996f 658/* Set the Pth character of STR to UCS-4 codepoint CHR. */
9c44cd45
MG
659void
660scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
661{
662 if (chr > 0xFF && scm_i_is_narrow_string (str))
ba54a202 663 SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
9c44cd45
MG
664
665 if (scm_i_is_narrow_string (str))
666 {
667 char *dst = scm_i_string_writable_chars (str);
587a3355 668 dst[p] = chr;
9c44cd45
MG
669 }
670 else
671 {
672 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
673 dst[p] = chr;
674 }
675}
676
ba54a202 677\f
3ee86942 678/* Symbols.
587a3355 679
3ee86942
MV
680 Basic symbol creation and accessing is done here, the rest is in
681 symbols.[hc]. This has been done to keep stringbufs and the
682 internals of strings and string-like objects confined to this file.
683*/
684
685#define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
686
687SCM
6869328b
MV
688scm_i_make_symbol (SCM name, scm_t_bits flags,
689 unsigned long hash, SCM props)
3ee86942
MV
690{
691 SCM buf;
692 size_t start = STRING_START (name);
693 size_t length = STRING_LENGTH (name);
694
695 if (IS_SH_STRING (name))
696 {
697 name = SH_STRING_STRING (name);
698 start += STRING_START (name);
699 }
700 buf = SYMBOL_STRINGBUF (name);
701
702 if (start == 0 && length == STRINGBUF_LENGTH (buf))
703 {
704 /* reuse buf. */
9de87eea 705 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 706 SET_STRINGBUF_SHARED (buf);
9de87eea 707 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
3ee86942
MV
708 }
709 else
710 {
711 /* make new buf. */
9c44cd45
MG
712 if (scm_i_is_narrow_string (name))
713 {
714 SCM new_buf = make_stringbuf (length);
715 memcpy (STRINGBUF_CHARS (new_buf),
716 STRINGBUF_CHARS (buf) + start, length);
717 buf = new_buf;
718 }
719 else
720 {
721 SCM new_buf = make_wide_stringbuf (length);
722 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
723 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
724 length);
725 buf = new_buf;
726 }
3ee86942 727 }
6869328b 728 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
3ee86942
MV
729 (scm_t_bits) hash, SCM_UNPACK (props));
730}
731
fd0a5bbc
HWN
732SCM
733scm_i_c_make_symbol (const char *name, size_t len,
734 scm_t_bits flags, unsigned long hash, SCM props)
735{
736 SCM buf = make_stringbuf (len);
737 memcpy (STRINGBUF_CHARS (buf), name, len);
738
a284cc7e
LC
739 return scm_immutable_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
740 (scm_t_bits) hash, SCM_UNPACK (props));
fd0a5bbc
HWN
741}
742
50b1996f
MG
743/* Returns the number of characters in SYM. This may be different
744 from the memory size of SYM. */
3ee86942
MV
745size_t
746scm_i_symbol_length (SCM sym)
0f2d19dd 747{
3ee86942 748 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
0f2d19dd
JB
749}
750
071bb6a8
LC
751size_t
752scm_c_symbol_length (SCM sym)
753#define FUNC_NAME "scm_c_symbol_length"
754{
755 SCM_VALIDATE_SYMBOL (1, sym);
756
757 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
758}
759#undef FUNC_NAME
760
50b1996f
MG
761/* True if the name of SYM is stored as a Latin-1 encoded string.
762 False if it is stored as a 32-bit UCS-4-encoded string. */
9c44cd45
MG
763int
764scm_i_is_narrow_symbol (SCM sym)
765{
766 SCM buf;
767
768 buf = SYMBOL_STRINGBUF (sym);
769 return !STRINGBUF_WIDE (buf);
770}
771
50b1996f
MG
772/* Returns a pointer to the 8-bit Latin-1 encoded character array that
773 contains the name of SYM. */
3ee86942
MV
774const char *
775scm_i_symbol_chars (SCM sym)
776{
9c44cd45
MG
777 SCM buf;
778
779 buf = SYMBOL_STRINGBUF (sym);
780 if (!STRINGBUF_WIDE (buf))
f59cf998 781 return (const char *) STRINGBUF_CHARS (buf);
9c44cd45
MG
782 else
783 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
784 scm_list_1 (sym));
785}
786
50b1996f
MG
787/* Return a pointer to the 32-bit UCS-4-encoded character array of a
788 symbol's name. */
9c44cd45
MG
789const scm_t_wchar *
790scm_i_symbol_wide_chars (SCM sym)
791{
792 SCM buf;
793
794 buf = SYMBOL_STRINGBUF (sym);
795 if (STRINGBUF_WIDE (buf))
f59cf998 796 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
9c44cd45
MG
797 else
798 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
799 scm_list_1 (sym));
3ee86942 800}
1cc91f1b 801
be54b15d 802SCM
3ee86942 803scm_i_symbol_substring (SCM sym, size_t start, size_t end)
be54b15d 804{
3ee86942 805 SCM buf = SYMBOL_STRINGBUF (sym);
9de87eea 806 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
3ee86942 807 SET_STRINGBUF_SHARED (buf);
9de87eea 808 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
fd2b17b9 809 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
3ee86942
MV
810 (scm_t_bits)start, (scm_t_bits) end - start);
811}
be54b15d 812
50b1996f 813/* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
9c44cd45
MG
814scm_t_wchar
815scm_i_symbol_ref (SCM sym, size_t x)
816{
817 if (scm_i_is_narrow_symbol (sym))
818 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
819 else
820 return scm_i_symbol_wide_chars (sym)[x];
821}
822
3ee86942
MV
823/* Debugging
824 */
be54b15d 825
6ce6923b
MG
826SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
827 "Returns an association list containing debugging information\n"
828 "for @var{str}. The association list has the following entries."
829 "@table @code\n"
830 "@item string\n"
831 "The string itself.\n"
832 "@item start\n"
833 "The start index of the string into its stringbuf\n"
834 "@item length\n"
835 "The length of the string\n"
836 "@item shared\n"
837 "If this string is a substring, it returns its parent string.\n"
838 "Otherwise, it returns @code{#f}\n"
88ed5759
MG
839 "@item read-only\n"
840 "@code{#t} if the string is read-only\n"
6ce6923b
MG
841 "@item stringbuf-chars\n"
842 "A new string containing this string's stringbuf's characters\n"
843 "@item stringbuf-length\n"
844 "The number of characters in this stringbuf\n"
845 "@item stringbuf-shared\n"
846 "@code{#t} if this stringbuf is shared\n"
6ce6923b
MG
847 "@item stringbuf-wide\n"
848 "@code{#t} if this stringbuf's characters are stored in a\n"
849 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
850 "buffer\n"
851 "@end table")
3ee86942
MV
852#define FUNC_NAME s_scm_sys_string_dump
853{
ba54a202 854 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
6ce6923b 855 SCM buf;
3ee86942 856 SCM_VALIDATE_STRING (1, str);
6ce6923b
MG
857
858 /* String info */
859 e1 = scm_cons (scm_from_locale_symbol ("string"),
860 str);
861 e2 = scm_cons (scm_from_locale_symbol ("start"),
862 scm_from_size_t (STRING_START (str)));
863 e3 = scm_cons (scm_from_locale_symbol ("length"),
864 scm_from_size_t (STRING_LENGTH (str)));
865
3ee86942
MV
866 if (IS_SH_STRING (str))
867 {
6ce6923b
MG
868 e4 = scm_cons (scm_from_locale_symbol ("shared"),
869 SH_STRING_STRING (str));
870 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
3ee86942
MV
871 }
872 else
873 {
6ce6923b
MG
874 e4 = scm_cons (scm_from_locale_symbol ("shared"),
875 SCM_BOOL_F);
876 buf = STRING_STRINGBUF (str);
3ee86942 877 }
9c44cd45 878
88ed5759
MG
879 if (IS_RO_STRING (str))
880 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
881 SCM_BOOL_T);
882 else
883 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
884 SCM_BOOL_F);
587a3355 885
6ce6923b 886 /* Stringbuf info */
6ce6923b
MG
887 if (!STRINGBUF_WIDE (buf))
888 {
889 size_t len = STRINGBUF_LENGTH (buf);
890 char *cbuf;
891 SCM sbc = scm_i_make_string (len, &cbuf);
892 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
893 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
894 sbc);
3ee86942 895 }
6ce6923b
MG
896 else
897 {
898 size_t len = STRINGBUF_LENGTH (buf);
899 scm_t_wchar *cbuf;
900 SCM sbc = scm_i_make_wide_string (len, &cbuf);
901 u32_cpy ((scm_t_uint32 *) cbuf,
902 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
903 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
904 sbc);
905 }
906 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
907 scm_from_size_t (STRINGBUF_LENGTH (buf)));
908 if (STRINGBUF_SHARED (buf))
909 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
910 SCM_BOOL_T);
911 else
912 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
913 SCM_BOOL_F);
6ce6923b 914 if (STRINGBUF_WIDE (buf))
ba54a202
LC
915 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
916 SCM_BOOL_T);
6ce6923b 917 else
ba54a202
LC
918 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
919 SCM_BOOL_F);
6ce6923b 920
ba54a202 921 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
3ee86942
MV
922}
923#undef FUNC_NAME
924
6ce6923b
MG
925SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
926 "Returns an association list containing debugging information\n"
927 "for @var{sym}. The association list has the following entries."
928 "@table @code\n"
929 "@item symbol\n"
930 "The symbol itself\n"
931 "@item hash\n"
932 "Its hash value\n"
88ed5759
MG
933 "@item interned\n"
934 "@code{#t} if it is an interned symbol\n"
6ce6923b
MG
935 "@item stringbuf-chars\n"
936 "A new string containing this symbols's stringbuf's characters\n"
937 "@item stringbuf-length\n"
938 "The number of characters in this stringbuf\n"
939 "@item stringbuf-shared\n"
940 "@code{#t} if this stringbuf is shared\n"
6ce6923b
MG
941 "@item stringbuf-wide\n"
942 "@code{#t} if this stringbuf's characters are stored in a\n"
943 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
944 "buffer\n"
945 "@end table")
3ee86942
MV
946#define FUNC_NAME s_scm_sys_symbol_dump
947{
ba54a202 948 SCM e1, e2, e3, e4, e5, e6, e7;
6ce6923b 949 SCM buf;
3ee86942 950 SCM_VALIDATE_SYMBOL (1, sym);
6ce6923b
MG
951 e1 = scm_cons (scm_from_locale_symbol ("symbol"),
952 sym);
953 e2 = scm_cons (scm_from_locale_symbol ("hash"),
954 scm_from_ulong (scm_i_symbol_hash (sym)));
88ed5759
MG
955 e3 = scm_cons (scm_from_locale_symbol ("interned"),
956 scm_symbol_interned_p (sym));
6ce6923b
MG
957 buf = SYMBOL_STRINGBUF (sym);
958
959 /* Stringbuf info */
6ce6923b
MG
960 if (!STRINGBUF_WIDE (buf))
961 {
962 size_t len = STRINGBUF_LENGTH (buf);
963 char *cbuf;
964 SCM sbc = scm_i_make_string (len, &cbuf);
965 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
966 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
967 sbc);
968 }
9c44cd45 969 else
6ce6923b
MG
970 {
971 size_t len = STRINGBUF_LENGTH (buf);
972 scm_t_wchar *cbuf;
973 SCM sbc = scm_i_make_wide_string (len, &cbuf);
974 u32_cpy ((scm_t_uint32 *) cbuf,
975 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
976 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
977 sbc);
978 }
979 e5 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
980 scm_from_size_t (STRINGBUF_LENGTH (buf)));
981 if (STRINGBUF_SHARED (buf))
982 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
983 SCM_BOOL_T);
984 else
985 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
986 SCM_BOOL_F);
6ce6923b 987 if (STRINGBUF_WIDE (buf))
ba54a202 988 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
6ce6923b
MG
989 SCM_BOOL_T);
990 else
ba54a202 991 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
6ce6923b 992 SCM_BOOL_F);
ba54a202 993 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
6ce6923b 994
3ee86942
MV
995}
996#undef FUNC_NAME
997
56a3dcd4 998#ifdef SCM_STRING_LENGTH_HISTOGRAM
6ce6923b 999
9c44cd45 1000SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
e1b29f6a 1001#define FUNC_NAME s_scm_sys_stringbuf_hist
3ee86942
MV
1002{
1003 int i;
1004 for (i = 0; i < 1000; i++)
1005 if (lenhist[i])
1006 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
1007 fprintf (stderr, ">999: %u\n", lenhist[1000]);
1008 return SCM_UNSPECIFIED;
be54b15d
DH
1009}
1010#undef FUNC_NAME
1011
3ee86942
MV
1012#endif
1013
1014\f
1015
1016SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
1017 (SCM obj),
1018 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1019#define FUNC_NAME s_scm_string_p
1020{
1021 return scm_from_bool (IS_STRING (obj));
1022}
1023#undef FUNC_NAME
1024
1025
1026SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1027
1028SCM_DEFINE (scm_string, "string", 0, 0, 1,
1029 (SCM chrs),
1030 "@deffnx {Scheme Procedure} list->string chrs\n"
1031 "Return a newly allocated string composed of the arguments,\n"
1032 "@var{chrs}.")
1033#define FUNC_NAME s_scm_string
1034{
9aa27c1a 1035 SCM result = SCM_BOOL_F;
9c44cd45 1036 SCM rest;
3ee86942 1037 size_t len;
9c44cd45
MG
1038 size_t p = 0;
1039 long i;
9aa27c1a 1040 int wide = 0;
3ee86942 1041
9c44cd45
MG
1042 /* Verify that this is a list of chars. */
1043 i = scm_ilength (chrs);
3c7cf7f5 1044 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
3ee86942 1045
9c44cd45
MG
1046 len = (size_t) i;
1047 rest = chrs;
3ee86942 1048
9c44cd45 1049 while (len > 0 && scm_is_pair (rest))
3ee86942 1050 {
9c44cd45 1051 SCM elt = SCM_CAR (rest);
3ee86942 1052 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
9aa27c1a
MG
1053 if (SCM_CHAR (elt) > 0xFF)
1054 wide = 1;
9c44cd45
MG
1055 rest = SCM_CDR (rest);
1056 len--;
1057 scm_remember_upto_here_1 (elt);
1058 }
1059
1060 /* Construct a string containing this list of chars. */
1061 len = (size_t) i;
1062 rest = chrs;
1063
9aa27c1a 1064 if (wide == 0)
9c44cd45 1065 {
56a3dcd4
LC
1066 char *buf;
1067
9aa27c1a
MG
1068 result = scm_i_make_string (len, NULL);
1069 result = scm_i_string_start_writing (result);
56a3dcd4 1070 buf = scm_i_string_writable_chars (result);
9aa27c1a
MG
1071 while (len > 0 && scm_is_pair (rest))
1072 {
1073 SCM elt = SCM_CAR (rest);
1074 buf[p] = (unsigned char) SCM_CHAR (elt);
1075 p++;
1076 rest = SCM_CDR (rest);
1077 len--;
1078 scm_remember_upto_here_1 (elt);
1079 }
1080 }
1081 else
1082 {
56a3dcd4
LC
1083 scm_t_wchar *buf;
1084
9aa27c1a
MG
1085 result = scm_i_make_wide_string (len, NULL);
1086 result = scm_i_string_start_writing (result);
56a3dcd4 1087 buf = scm_i_string_writable_wide_chars (result);
9aa27c1a
MG
1088 while (len > 0 && scm_is_pair (rest))
1089 {
1090 SCM elt = SCM_CAR (rest);
1091 buf[p] = SCM_CHAR (elt);
1092 p++;
1093 rest = SCM_CDR (rest);
1094 len--;
1095 scm_remember_upto_here_1 (elt);
1096 }
3ee86942 1097 }
9c44cd45
MG
1098 scm_i_string_stop_writing ();
1099
3ee86942
MV
1100 if (len > 0)
1101 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
9c44cd45 1102 if (!scm_is_null (rest))
3ee86942
MV
1103 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1104
1105 return result;
1106}
1107#undef FUNC_NAME
be54b15d 1108
3b3b36dd 1109SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
6fa73e72 1110 (SCM k, SCM chr),
0d26a824
MG
1111 "Return a newly allocated string of\n"
1112 "length @var{k}. If @var{chr} is given, then all elements of\n"
1113 "the string are initialized to @var{chr}, otherwise the contents\n"
9401323e 1114 "of the @var{string} are unspecified.")
1bbd0b84 1115#define FUNC_NAME s_scm_make_string
0f2d19dd 1116{
3ee86942
MV
1117 return scm_c_make_string (scm_to_size_t (k), chr);
1118}
1119#undef FUNC_NAME
1120
1121SCM
1122scm_c_make_string (size_t len, SCM chr)
1123#define FUNC_NAME NULL
1124{
9c44cd45
MG
1125 size_t p;
1126 SCM res = scm_i_make_string (len, NULL);
cb0d8be2 1127
e11e83f3
MV
1128 if (!SCM_UNBNDP (chr))
1129 {
3ee86942 1130 SCM_VALIDATE_CHAR (0, chr);
9c44cd45
MG
1131 res = scm_i_string_start_writing (res);
1132 for (p = 0; p < len; p++)
1133 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1134 scm_i_string_stop_writing ();
0f2d19dd 1135 }
e11e83f3
MV
1136
1137 return res;
0f2d19dd 1138}
1bbd0b84 1139#undef FUNC_NAME
0f2d19dd 1140
3b3b36dd 1141SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
0d26a824
MG
1142 (SCM string),
1143 "Return the number of characters in @var{string}.")
1bbd0b84 1144#define FUNC_NAME s_scm_string_length
0f2d19dd 1145{
d1ca2c64 1146 SCM_VALIDATE_STRING (1, string);
3ee86942 1147 return scm_from_size_t (STRING_LENGTH (string));
0f2d19dd 1148}
1bbd0b84 1149#undef FUNC_NAME
0f2d19dd 1150
f8ba2bb9 1151SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
9c44cd45
MG
1152 (SCM string),
1153 "Return the bytes used to represent a character in @var{string}."
1154 "This will return 1 or 4.")
f8ba2bb9 1155#define FUNC_NAME s_scm_string_bytes_per_char
9c44cd45
MG
1156{
1157 SCM_VALIDATE_STRING (1, string);
1158 if (!scm_i_is_narrow_string (string))
1159 return scm_from_int (4);
1160
1161 return scm_from_int (1);
1162}
1163#undef FUNC_NAME
1164
3ee86942
MV
1165size_t
1166scm_c_string_length (SCM string)
1167{
1168 if (!IS_STRING (string))
1169 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1170 return STRING_LENGTH (string);
1171}
1172
bd9e24b3 1173SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
6fa73e72 1174 (SCM str, SCM k),
9c44cd45
MG
1175 "Return character @var{k} of @var{str} using zero-origin\n"
1176 "indexing. @var{k} must be a valid index of @var{str}.")
1bbd0b84 1177#define FUNC_NAME s_scm_string_ref
0f2d19dd 1178{
3ae3166b 1179 size_t len;
a55c2b68 1180 unsigned long idx;
bd9e24b3 1181
d1ca2c64 1182 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1183
1184 len = scm_i_string_length (str);
1185 if (SCM_LIKELY (len > 0))
1186 idx = scm_to_unsigned_integer (k, 0, len - 1);
1187 else
1188 scm_out_of_range (NULL, k);
1189
9c44cd45
MG
1190 if (scm_i_is_narrow_string (str))
1191 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1192 else
1193 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
0f2d19dd 1194}
1bbd0b84 1195#undef FUNC_NAME
0f2d19dd 1196
3ee86942
MV
1197SCM
1198scm_c_string_ref (SCM str, size_t p)
1199{
1200 if (p >= scm_i_string_length (str))
1201 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1202 if (scm_i_is_narrow_string (str))
1203 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1204 else
1205 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1206
3ee86942 1207}
f0942910 1208
3b3b36dd 1209SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
6fa73e72 1210 (SCM str, SCM k, SCM chr),
9c44cd45
MG
1211 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1212 "an unspecified value. @var{k} must be a valid index of\n"
1213 "@var{str}.")
1bbd0b84 1214#define FUNC_NAME s_scm_string_set_x
0f2d19dd 1215{
3ae3166b 1216 size_t len;
a55c2b68
MV
1217 unsigned long idx;
1218
f0942910 1219 SCM_VALIDATE_STRING (1, str);
3ae3166b
LC
1220
1221 len = scm_i_string_length (str);
1222 if (SCM_LIKELY (len > 0))
1223 idx = scm_to_unsigned_integer (k, 0, len - 1);
1224 else
1225 scm_out_of_range (NULL, k);
1226
34d19ef6 1227 SCM_VALIDATE_CHAR (3, chr);
9c44cd45
MG
1228 str = scm_i_string_start_writing (str);
1229 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1230 scm_i_string_stop_writing ();
1231
0f2d19dd
JB
1232 return SCM_UNSPECIFIED;
1233}
1bbd0b84 1234#undef FUNC_NAME
0f2d19dd 1235
3ee86942
MV
1236void
1237scm_c_string_set_x (SCM str, size_t p, SCM chr)
1238{
1239 if (p >= scm_i_string_length (str))
1240 scm_out_of_range (NULL, scm_from_size_t (p));
9c44cd45
MG
1241 str = scm_i_string_start_writing (str);
1242 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1243 scm_i_string_stop_writing ();
3ee86942 1244}
0f2d19dd 1245
3b3b36dd 1246SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
0d26a824
MG
1247 (SCM str, SCM start, SCM end),
1248 "Return a newly allocated string formed from the characters\n"
1249 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1250 "ending with index @var{end} (exclusive).\n"
1251 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1252 "exact integers satisfying:\n\n"
1253 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1bbd0b84 1254#define FUNC_NAME s_scm_substring
0f2d19dd 1255{
3ee86942 1256 size_t len, from, to;
685c0d71 1257
d1ca2c64 1258 SCM_VALIDATE_STRING (1, str);
3ee86942
MV
1259 len = scm_i_string_length (str);
1260 from = scm_to_unsigned_integer (start, 0, len);
a55c2b68 1261 if (SCM_UNBNDP (end))
3ee86942 1262 to = len;
a55c2b68 1263 else
3ee86942
MV
1264 to = scm_to_unsigned_integer (end, from, len);
1265 return scm_i_substring (str, from, to);
0f2d19dd 1266}
1bbd0b84 1267#undef FUNC_NAME
0f2d19dd 1268
ed35de72
MV
1269SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1270 (SCM str, SCM start, SCM end),
1271 "Return a newly allocated string formed from the characters\n"
1272 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1273 "ending with index @var{end} (exclusive).\n"
1274 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1275 "exact integers satisfying:\n"
1276 "\n"
1277 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1278 "\n"
1279 "The returned string is read-only.\n")
1280#define FUNC_NAME s_scm_substring_read_only
1281{
1282 size_t len, from, to;
1283
1284 SCM_VALIDATE_STRING (1, str);
1285 len = scm_i_string_length (str);
1286 from = scm_to_unsigned_integer (start, 0, len);
1287 if (SCM_UNBNDP (end))
1288 to = len;
1289 else
1290 to = scm_to_unsigned_integer (end, from, len);
1291 return scm_i_substring_read_only (str, from, to);
1292}
1293#undef FUNC_NAME
1294
3ee86942
MV
1295SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1296 (SCM str, SCM start, SCM end),
1297 "Return a newly allocated string formed from the characters\n"
1298 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1299 "ending with index @var{end} (exclusive).\n"
1300 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1301 "exact integers satisfying:\n\n"
1302 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1303#define FUNC_NAME s_scm_substring_copy
1304{
e1b29f6a
MV
1305 /* For the Scheme version, START is mandatory, but for the C
1306 version, it is optional. See scm_string_copy in srfi-13.c for a
1307 rationale.
1308 */
1309
1310 size_t from, to;
3ee86942
MV
1311
1312 SCM_VALIDATE_STRING (1, str);
e1b29f6a
MV
1313 scm_i_get_substring_spec (scm_i_string_length (str),
1314 start, &from, end, &to);
3ee86942
MV
1315 return scm_i_substring_copy (str, from, to);
1316}
1317#undef FUNC_NAME
1318
1319SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1320 (SCM str, SCM start, SCM end),
1321 "Return string that indirectly refers to the characters\n"
1322 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1323 "ending with index @var{end} (exclusive).\n"
1324 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1325 "exact integers satisfying:\n\n"
1326 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1327#define FUNC_NAME s_scm_substring_shared
1328{
1329 size_t len, from, to;
1330
1331 SCM_VALIDATE_STRING (1, str);
1332 len = scm_i_string_length (str);
1333 from = scm_to_unsigned_integer (start, 0, len);
1334 if (SCM_UNBNDP (end))
1335 to = len;
1336 else
1337 to = scm_to_unsigned_integer (end, from, len);
1338 return scm_i_substring_shared (str, from, to);
1339}
1340#undef FUNC_NAME
685c0d71 1341
3b3b36dd 1342SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
6fa73e72 1343 (SCM args),
9c44cd45 1344 "Return a newly allocated string whose characters form the\n"
0d26a824 1345 "concatenation of the given strings, @var{args}.")
1bbd0b84 1346#define FUNC_NAME s_scm_string_append
0f2d19dd
JB
1347{
1348 SCM res;
9c44cd45
MG
1349 size_t len = 0;
1350 int wide = 0;
c829a427 1351 SCM l, s;
bd4911ef 1352 size_t i;
9909c395
MG
1353 union
1354 {
1355 char *narrow;
1356 scm_t_wchar *wide;
1357 } data;
af45e3b0
DH
1358
1359 SCM_VALIDATE_REST_ARGUMENT (args);
9c44cd45 1360 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427
MV
1361 {
1362 s = SCM_CAR (l);
1363 SCM_VALIDATE_STRING (SCM_ARGn, s);
9c44cd45
MG
1364 len += scm_i_string_length (s);
1365 if (!scm_i_is_narrow_string (s))
1366 wide = 1;
c829a427 1367 }
9909c395 1368 data.narrow = NULL;
9c44cd45 1369 if (!wide)
9909c395 1370 res = scm_i_make_string (len, &data.narrow);
9c44cd45 1371 else
9909c395 1372 res = scm_i_make_wide_string (len, &data.wide);
9c44cd45
MG
1373
1374 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
c829a427 1375 {
edea856c 1376 size_t len;
c829a427 1377 s = SCM_CAR (l);
3ee86942 1378 SCM_VALIDATE_STRING (SCM_ARGn, s);
edea856c 1379 len = scm_i_string_length (s);
9c44cd45
MG
1380 if (!wide)
1381 {
9909c395
MG
1382 memcpy (data.narrow, scm_i_string_chars (s), len);
1383 data.narrow += len;
9c44cd45
MG
1384 }
1385 else
1386 {
1387 if (scm_i_is_narrow_string (s))
1388 {
1389 for (i = 0; i < scm_i_string_length (s); i++)
9909c395 1390 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
9c44cd45
MG
1391 }
1392 else
9909c395 1393 u32_cpy ((scm_t_uint32 *) data.wide,
9c44cd45 1394 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
9909c395 1395 data.wide += len;
9c44cd45 1396 }
c829a427
MV
1397 scm_remember_upto_here_1 (s);
1398 }
0f2d19dd
JB
1399 return res;
1400}
1bbd0b84 1401#undef FUNC_NAME
0f2d19dd 1402
c829a427
MV
1403int
1404scm_is_string (SCM obj)
1405{
3ee86942 1406 return IS_STRING (obj);
c829a427 1407}
24933780 1408
a3d7d5d5
LC
1409\f
1410/* Conversion to/from other encodings. */
1411
1412SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
d14418a5 1413void
ef7e4ba3
LC
1414scm_encoding_error (const char *subr, int err, const char *message,
1415 const char *from, const char *to, SCM string_or_bv)
1416{
1417 /* Raise an exception that conveys all the information needed to debug the
1418 problem. Only perform locale conversions that are safe; in particular,
1419 don't try to display STRING_OR_BV when it's a string since converting it to
1420 the output locale may fail. */
1421 scm_throw (scm_encoding_error_key,
1422 scm_list_n (scm_from_locale_string (subr),
1423 scm_from_locale_string (message),
1424 scm_from_int (err),
1425 scm_from_locale_string (from),
1426 scm_from_locale_string (to),
1427 string_or_bv,
1428 SCM_UNDEFINED));
a3d7d5d5
LC
1429}
1430
fac32b51 1431SCM
587a3355
MG
1432scm_from_stringn (const char *str, size_t len, const char *encoding,
1433 scm_t_string_failed_conversion_handler handler)
1434{
1435 size_t u32len, i;
1436 scm_t_wchar *u32;
1437 int wide = 0;
1438 SCM res;
1439
fac32b51
MG
1440 if (len == 0)
1441 return scm_nullstr;
1442
889975e5
MG
1443 if (encoding == NULL)
1444 {
1445 /* If encoding is null, use Latin-1. */
1446 char *buf;
1447 res = scm_i_make_string (len, &buf);
1448 memcpy (buf, str, len);
1449 return res;
1450 }
1451
587a3355
MG
1452 u32len = 0;
1453 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1454 (enum iconv_ilseq_handler)
1455 handler,
1456 str, len,
1457 NULL,
1458 NULL, &u32len);
1459
ef7e4ba3 1460 if (SCM_UNLIKELY (u32 == NULL))
587a3355 1461 {
ef7e4ba3
LC
1462 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1463 handler. */
1464 SCM bv;
1465 signed char *buf;
1466
1467 buf = scm_gc_malloc_pointerless (len, "bytevector");
1468 memcpy (buf, str, len);
1469 bv = scm_c_take_bytevector (buf, len);
1470
1471 scm_encoding_error (__func__, errno,
1472 "input locale conversion error",
1473 encoding, "UTF-32", bv);
587a3355
MG
1474 }
1475
1476 i = 0;
1477 while (i < u32len)
1478 if (u32[i++] > 0xFF)
1479 {
1480 wide = 1;
1481 break;
1482 }
1483
1484 if (!wide)
1485 {
1486 char *dst;
1487 res = scm_i_make_string (u32len, &dst);
1488 for (i = 0; i < u32len; i ++)
1489 dst[i] = (unsigned char) u32[i];
1490 dst[u32len] = '\0';
1491 }
1492 else
1493 {
1494 scm_t_wchar *wdst;
1495 res = scm_i_make_wide_string (u32len, &wdst);
1496 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1497 wdst[u32len] = 0;
1498 }
1499
1500 free (u32);
1501 return res;
1502}
1503
cf313a94
MG
1504SCM
1505scm_from_latin1_stringn (const char *str, size_t len)
1506{
1507 return scm_from_stringn (str, len, NULL, SCM_FAILED_CONVERSION_ERROR);
1508}
1509
c829a427
MV
1510SCM
1511scm_from_locale_stringn (const char *str, size_t len)
1512{
889975e5
MG
1513 const char *enc;
1514 scm_t_string_failed_conversion_handler hndl;
1515 SCM inport;
1516 scm_t_port *pt;
4d4528e7 1517
9c44cd45 1518 if (len == (size_t) -1)
c829a427 1519 len = strlen (str);
9c44cd45
MG
1520 if (len == 0)
1521 return scm_nullstr;
1522
889975e5
MG
1523 inport = scm_current_input_port ();
1524 if (!SCM_UNBNDP (inport) && SCM_OPINPORTP (inport))
1525 {
1526 pt = SCM_PTAB_ENTRY (inport);
1527 enc = pt->encoding;
1528 hndl = pt->ilseq_handler;
1529 }
1530 else
1531 {
1532 enc = NULL;
1533 hndl = SCM_FAILED_CONVERSION_ERROR;
1534 }
1535
1536 return scm_from_stringn (str, len, enc, hndl);
c829a427 1537}
4d4528e7 1538
c829a427
MV
1539SCM
1540scm_from_locale_string (const char *str)
4d4528e7 1541{
9c44cd45
MG
1542 if (str == NULL)
1543 return scm_nullstr;
1544
c829a427
MV
1545 return scm_from_locale_stringn (str, -1);
1546}
4d4528e7 1547
587a3355
MG
1548SCM
1549scm_i_from_utf8_string (const scm_t_uint8 *str)
1550{
1551 return scm_from_stringn ((const char *) str,
1552 strlen ((char *) str), "UTF-8",
1553 SCM_FAILED_CONVERSION_ERROR);
1554}
1555
50b1996f
MG
1556/* Create a new scheme string from the C string STR. The memory of
1557 STR may be used directly as storage for the new string. */
13a94556
LC
1558/* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1559 would be to register a finalizer to eventually free(3) STR, which isn't
1560 worth it. Should we just deprecate the `scm_take_' functions? */
c829a427
MV
1561SCM
1562scm_take_locale_stringn (char *str, size_t len)
1563{
13a94556 1564 SCM res;
48ddf0d9 1565
13a94556
LC
1566 res = scm_from_locale_stringn (str, len);
1567 free (str);
c829a427 1568
c829a427
MV
1569 return res;
1570}
1571
48ddf0d9
KR
1572SCM
1573scm_take_locale_string (char *str)
1574{
1575 return scm_take_locale_stringn (str, -1);
1576}
1577
9c44cd45
MG
1578/* Change libunistring escapes (\uXXXX and \UXXXXXXXX) to \xXX \uXXXX
1579 and \UXXXXXX. */
4ff2b9f4
LC
1580void
1581scm_i_unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
9c44cd45
MG
1582{
1583 char *before, *after;
1584 size_t i, j;
1585
4ff2b9f4
LC
1586 before = buf;
1587 after = buf;
9c44cd45
MG
1588 i = 0;
1589 j = 0;
1590 while (i < *lenp)
1591 {
1592 if ((i <= *lenp - 6)
1593 && before[i] == '\\'
1594 && before[i + 1] == 'u'
1595 && before[i + 2] == '0' && before[i + 3] == '0')
1596 {
1597 /* Convert \u00NN to \xNN */
1598 after[j] = '\\';
1599 after[j + 1] = 'x';
30a6b9ca
MG
1600 after[j + 2] = tolower ((int) before[i + 4]);
1601 after[j + 3] = tolower ((int) before[i + 5]);
9c44cd45
MG
1602 i += 6;
1603 j += 4;
1604 }
1605 else if ((i <= *lenp - 10)
1606 && before[i] == '\\'
1607 && before[i + 1] == 'U'
1608 && before[i + 2] == '0' && before[i + 3] == '0')
1609 {
1610 /* Convert \U00NNNNNN to \UNNNNNN */
1611 after[j] = '\\';
1612 after[j + 1] = 'U';
30a6b9ca
MG
1613 after[j + 2] = tolower ((int) before[i + 4]);
1614 after[j + 3] = tolower ((int) before[i + 5]);
1615 after[j + 4] = tolower ((int) before[i + 6]);
1616 after[j + 5] = tolower ((int) before[i + 7]);
1617 after[j + 6] = tolower ((int) before[i + 8]);
1618 after[j + 7] = tolower ((int) before[i + 9]);
9c44cd45
MG
1619 i += 10;
1620 j += 8;
1621 }
1622 else
1623 {
1624 after[j] = before[i];
1625 i++;
1626 j++;
1627 }
1628 }
1629 *lenp = j;
9c44cd45
MG
1630}
1631
d31b9519 1632/* Change libunistring escapes (\uXXXX and \UXXXXXXXX) to \xXXXX; */
4ff2b9f4
LC
1633void
1634scm_i_unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
d31b9519
MG
1635{
1636 char *before, *after;
1637 size_t i, j;
1638 /* The worst case is if the input string contains all 4-digit hex escapes.
1639 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1640 size_t max_out_len = (*lenp * 7) / 6 + 1;
1641 size_t nzeros, ndigits;
1642
4ff2b9f4 1643 before = buf;
d31b9519
MG
1644 after = alloca (max_out_len);
1645 i = 0;
1646 j = 0;
1647 while (i < *lenp)
1648 {
1649 if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
1650 || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
1651 {
1652 if (before[i + 1] == 'u')
1653 ndigits = 4;
1654 else if (before[i + 1] == 'U')
1655 ndigits = 8;
1656 else
1657 abort ();
1658
1659 /* Add the R6RS hex escape initial sequence. */
1660 after[j] = '\\';
1661 after[j + 1] = 'x';
1662
1663 /* Move string positions to the start of the hex numbers. */
1664 i += 2;
1665 j += 2;
1666
1667 /* Find the number of initial zeros in this hex number. */
1668 nzeros = 0;
1669 while (before[i + nzeros] == '0' && nzeros < ndigits)
1670 nzeros++;
1671
1672 /* Copy the number, skipping initial zeros, and then move the string
1673 positions. */
1674 if (nzeros == ndigits)
1675 {
1676 after[j] = '0';
1677 i += ndigits;
1678 j += 1;
1679 }
1680 else
1681 {
1682 int pos;
1683 for (pos = 0; pos < ndigits - nzeros; pos++)
1684 after[j + pos] = tolower ((int) before[i + nzeros + pos]);
1685 i += ndigits;
1686 j += (ndigits - nzeros);
1687 }
1688
1689 /* Add terminating semicolon. */
1690 after[j] = ';';
1691 j++;
1692 }
1693 else
1694 {
1695 after[j] = before[i];
1696 i++;
1697 j++;
1698 }
1699 }
1700 *lenp = j;
d31b9519
MG
1701 memcpy (before, after, j);
1702}
1703
cf313a94
MG
1704char *
1705scm_to_latin1_stringn (SCM str, size_t *lenp)
1706{
1707 return scm_to_stringn (str, lenp, NULL, SCM_FAILED_CONVERSION_ERROR);
1708}
d31b9519 1709
c829a427 1710char *
fac32b51 1711scm_to_locale_stringn (SCM str, size_t *lenp)
c829a427 1712{
889975e5
MG
1713 SCM outport;
1714 scm_t_port *pt;
9c44cd45
MG
1715 const char *enc;
1716
889975e5
MG
1717 outport = scm_current_output_port ();
1718 if (!SCM_UNBNDP (outport) && SCM_OPOUTPORTP (outport))
1719 {
1720 pt = SCM_PTAB_ENTRY (outport);
1721 enc = pt->encoding;
1722 }
1723 else
1724 enc = NULL;
9c44cd45 1725
889975e5
MG
1726 return scm_to_stringn (str, lenp,
1727 enc,
1728 scm_i_get_conversion_strategy (SCM_BOOL_F));
9c44cd45
MG
1729}
1730
29bcdbb0
LC
1731/* Return a malloc(3)-allocated buffer containing the contents of STR encoded
1732 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
1733 the returned buffer. If the conversion to ENCODING fails, apply the strategy
1734 defined by HANDLER. */
9c44cd45 1735char *
587a3355 1736scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
eca29b02 1737 scm_t_string_failed_conversion_handler handler)
9c44cd45 1738{
9c44cd45
MG
1739 char *buf;
1740 size_t ilen, len, i;
889975e5
MG
1741 int ret;
1742 const char *enc;
4d4528e7 1743
3ee86942 1744 if (!scm_is_string (str))
c829a427 1745 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1746 ilen = scm_i_string_length (str);
1747
1748 if (ilen == 0)
1749 {
1750 buf = scm_malloc (1);
1751 buf[0] = '\0';
1752 if (lenp)
1753 *lenp = 0;
1754 return buf;
1755 }
587a3355 1756
c829a427 1757 if (lenp == NULL)
9c44cd45
MG
1758 for (i = 0; i < ilen; i++)
1759 if (scm_i_string_ref (str, i) == '\0')
1760 scm_misc_error (NULL,
1761 "string contains #\\nul character: ~S",
1762 scm_list_1 (str));
1763
889975e5 1764 if (scm_i_is_narrow_string (str) && (encoding == NULL))
c829a427 1765 {
889975e5
MG
1766 /* If using native Latin-1 encoding, just copy the string
1767 contents. */
9c44cd45
MG
1768 if (lenp)
1769 {
1770 buf = scm_malloc (ilen);
1771 memcpy (buf, scm_i_string_chars (str), ilen);
1772 *lenp = ilen;
1773 return buf;
1774 }
1775 else
1776 {
1777 buf = scm_malloc (ilen + 1);
1778 memcpy (buf, scm_i_string_chars (str), ilen);
1779 buf[ilen] = '\0';
1780 return buf;
1781 }
c829a427 1782 }
9c44cd45 1783
587a3355 1784
9c44cd45
MG
1785 buf = NULL;
1786 len = 0;
889975e5
MG
1787 enc = encoding;
1788 if (enc == NULL)
1789 enc = "ISO-8859-1";
1790 if (scm_i_is_narrow_string (str))
1791 {
1792 ret = mem_iconveh (scm_i_string_chars (str), ilen,
1793 "ISO-8859-1", enc,
1794 (enum iconv_ilseq_handler) handler, NULL,
1795 &buf, &len);
9c44cd45 1796
889975e5 1797 if (ret != 0)
ef7e4ba3
LC
1798 scm_encoding_error (__func__, errno,
1799 "cannot convert to output locale",
1800 "ISO-8859-1", enc, str);
889975e5
MG
1801 }
1802 else
1803 {
d31b9519 1804 buf = u32_conv_to_encoding (enc,
889975e5 1805 (enum iconv_ilseq_handler) handler,
d31b9519 1806 (scm_t_uint32 *) scm_i_string_wide_chars (str),
889975e5
MG
1807 ilen,
1808 NULL,
1809 NULL, &len);
1810 if (buf == NULL)
ef7e4ba3
LC
1811 scm_encoding_error (__func__, errno,
1812 "cannot convert to output locale",
1813 "UTF-32", enc, str);
d31b9519
MG
1814 }
1815 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
1816 {
1817 if (SCM_R6RS_ESCAPES_P)
4ff2b9f4 1818 scm_i_unistring_escapes_to_r6rs_escapes (buf, &len);
d31b9519 1819 else
4ff2b9f4
LC
1820 scm_i_unistring_escapes_to_guile_escapes (buf, &len);
1821
1822 buf = scm_realloc (buf, len);
889975e5 1823 }
9c44cd45 1824 if (lenp)
4d4528e7 1825 *lenp = len;
9c44cd45
MG
1826 else
1827 {
1828 buf = scm_realloc (buf, len + 1);
1829 buf[len] = '\0';
1830 }
24933780 1831
c829a427 1832 scm_remember_upto_here_1 (str);
9c44cd45 1833 return buf;
4d4528e7 1834}
af68e5e5 1835
c829a427
MV
1836char *
1837scm_to_locale_string (SCM str)
1838{
1839 return scm_to_locale_stringn (str, NULL);
1840}
af68e5e5 1841
587a3355
MG
1842scm_t_uint8 *
1843scm_i_to_utf8_string (SCM str)
1844{
1845 char *u8str;
1846 u8str = scm_to_stringn (str, NULL, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1847 return (scm_t_uint8 *) u8str;
1848}
1849
c829a427
MV
1850size_t
1851scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
1852{
1853 size_t len;
9c44cd45 1854 char *result = NULL;
3ee86942 1855 if (!scm_is_string (str))
c829a427 1856 scm_wrong_type_arg_msg (NULL, 0, str, "string");
9c44cd45
MG
1857 result = scm_to_locale_stringn (str, &len);
1858
1859 memcpy (buf, result, (len > max_len) ? max_len : len);
1860 free (result);
1861
c829a427
MV
1862 scm_remember_upto_here_1 (str);
1863 return len;
1864}
af68e5e5 1865
a3d7d5d5
LC
1866\f
1867/* Unicode string normalization. */
1868
edb7bb47
JG
1869/* This function is a partial clone of SCM_STRING_TO_U32_BUF from
1870 libguile/i18n.c. It would be useful to have this factored out into a more
1871 convenient location, but its use of alloca makes that tricky to do. */
1872
1873static SCM
1874normalize_str (SCM string, uninorm_t form)
1875{
1876 SCM ret;
1877 scm_t_uint32 *w_str;
1878 scm_t_wchar *cbuf;
1879 size_t rlen, len = scm_i_string_length (string);
1880
1881 if (scm_i_is_narrow_string (string))
1882 {
1883 size_t i;
1884 const char *buf = scm_i_string_chars (string);
1885
1886 w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
1887
1888 for (i = 0; i < len; i ++)
1889 w_str[i] = (unsigned char) buf[i];
1890 w_str[len] = 0;
1891 }
d8164b04
JG
1892 else
1893 w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
1894
edb7bb47
JG
1895 w_str = u32_normalize (form, w_str, len, NULL, &rlen);
1896
1897 ret = scm_i_make_wide_string (rlen, &cbuf);
1898 u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
1899 free (w_str);
d8164b04
JG
1900
1901 scm_i_try_narrow_string (ret);
1902
edb7bb47
JG
1903 return ret;
1904}
1905
1906SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
1907 (SCM string),
1908 "Returns the NFC normalized form of @var{string}.")
1909#define FUNC_NAME s_scm_string_normalize_nfc
1910{
1911 SCM_VALIDATE_STRING (1, string);
1912 return normalize_str (string, UNINORM_NFC);
1913}
1914#undef FUNC_NAME
1915
1916SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
1917 (SCM string),
1918 "Returns the NFD normalized form of @var{string}.")
1919#define FUNC_NAME s_scm_string_normalize_nfd
1920{
1921 SCM_VALIDATE_STRING (1, string);
1922 return normalize_str (string, UNINORM_NFD);
1923}
1924#undef FUNC_NAME
1925
1926SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
1927 (SCM string),
1928 "Returns the NFKC normalized form of @var{string}.")
1929#define FUNC_NAME s_scm_string_normalize_nfkc
1930{
1931 SCM_VALIDATE_STRING (1, string);
1932 return normalize_str (string, UNINORM_NFKC);
1933}
1934#undef FUNC_NAME
1935
1936SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
1937 (SCM string),
1938 "Returns the NFKD normalized form of @var{string}.")
1939#define FUNC_NAME s_scm_string_normalize_nfkd
1940{
1941 SCM_VALIDATE_STRING (1, string);
1942 return normalize_str (string, UNINORM_NFKD);
1943}
1944#undef FUNC_NAME
1945
3ee86942
MV
1946/* converts C scm_array of strings to SCM scm_list of strings. */
1947/* If argc < 0, a null terminated scm_array is assumed. */
9c44cd45 1948SCM
3ee86942
MV
1949scm_makfromstrs (int argc, char **argv)
1950{
1951 int i = argc;
1952 SCM lst = SCM_EOL;
1953 if (0 > i)
1954 for (i = 0; argv[i]; i++);
1955 while (i--)
1956 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
1957 return lst;
1958}
1959
c829a427
MV
1960/* Return a newly allocated array of char pointers to each of the strings
1961 in args, with a terminating NULL pointer. */
1962
1963char **
1964scm_i_allocate_string_pointers (SCM list)
2a776823 1965#define FUNC_NAME "scm_i_allocate_string_pointers"
af68e5e5 1966{
c829a427
MV
1967 char **result;
1968 int len = scm_ilength (list);
1969 int i;
1970
1971 if (len < 0)
1972 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
1973
2a776823
LC
1974 result = scm_gc_malloc ((len + 1) * sizeof (char *),
1975 "string pointers");
c829a427 1976 result[len] = NULL;
c829a427
MV
1977
1978 /* The list might be have been modified in another thread, so
1979 we check LIST before each access.
1980 */
d2e53ed6 1981 for (i = 0; i < len && scm_is_pair (list); i++)
c829a427 1982 {
2a776823
LC
1983 SCM str;
1984 size_t len;
1985
1986 str = SCM_CAR (list);
1987 len = scm_c_string_length (str);
1988
1989 result[i] = scm_gc_malloc_pointerless (len + 1, "string pointers");
1990 memcpy (result[i], scm_i_string_chars (str), len);
1991 result[i][len] = '\0';
1992
c829a427
MV
1993 list = SCM_CDR (list);
1994 }
1995
c829a427 1996 return result;
af68e5e5 1997}
2a776823 1998#undef FUNC_NAME
24933780 1999
6f14f578
MV
2000void
2001scm_i_get_substring_spec (size_t len,
2002 SCM start, size_t *cstart,
2003 SCM end, size_t *cend)
2004{
2005 if (SCM_UNBNDP (start))
2006 *cstart = 0;
2007 else
2008 *cstart = scm_to_unsigned_integer (start, 0, len);
2009
2010 if (SCM_UNBNDP (end))
2011 *cend = len;
2012 else
2013 *cend = scm_to_unsigned_integer (end, *cstart, len);
2014}
2015
3ee86942
MV
2016#if SCM_ENABLE_DEPRECATED
2017
556d75db
MV
2018/* When these definitions are removed, it becomes reasonable to use
2019 read-only strings for string literals. For that, change the reader
2020 to create string literals with scm_c_substring_read_only instead of
2021 with scm_c_substring_copy.
2022*/
2023
3ee86942 2024int
fe78c51a 2025scm_i_deprecated_stringp (SCM str)
3ee86942
MV
2026{
2027 scm_c_issue_deprecation_warning
2028 ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
2029
2616f0e0 2030 return scm_is_string (str);
3ee86942
MV
2031}
2032
2033char *
fe78c51a 2034scm_i_deprecated_string_chars (SCM str)
3ee86942
MV
2035{
2036 char *chars;
2037
2038 scm_c_issue_deprecation_warning
2039 ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
2040
2616f0e0
MV
2041 /* We don't accept shared substrings here since they are not
2042 null-terminated.
2043 */
2044 if (IS_SH_STRING (str))
c291b588
LC
2045 scm_misc_error (NULL,
2046 "SCM_STRING_CHARS does not work with shared substrings",
2616f0e0
MV
2047 SCM_EOL);
2048
877f06c3 2049 /* We explicitly test for read-only strings to produce a better
556d75db
MV
2050 error message.
2051 */
2052
2053 if (IS_RO_STRING (str))
c291b588
LC
2054 scm_misc_error (NULL,
2055 "SCM_STRING_CHARS does not work with read-only strings",
556d75db 2056 SCM_EOL);
c291b588 2057
2616f0e0 2058 /* The following is still wrong, of course...
3ee86942 2059 */
9c44cd45 2060 str = scm_i_string_start_writing (str);
3ee86942
MV
2061 chars = scm_i_string_writable_chars (str);
2062 scm_i_string_stop_writing ();
2063 return chars;
2064}
2065
2066size_t
fe78c51a 2067scm_i_deprecated_string_length (SCM str)
3ee86942
MV
2068{
2069 scm_c_issue_deprecation_warning
2070 ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
2071 return scm_c_string_length (str);
2072}
2073
2074#endif
2075
2a610be5
AW
2076static SCM
2077string_handle_ref (scm_t_array_handle *h, size_t index)
2078{
2079 return scm_c_string_ref (h->array, index);
2080}
2081
2082static void
2083string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
2084{
2085 scm_c_string_set_x (h->array, index, val);
2086}
2087
2088static void
2089string_get_handle (SCM v, scm_t_array_handle *h)
2090{
2091 h->array = v;
2092 h->ndims = 1;
2093 h->dims = &h->dim0;
2094 h->dim0.lbnd = 0;
2095 h->dim0.ubnd = scm_c_string_length (v) - 1;
2096 h->dim0.inc = 1;
2097 h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
2098 h->elements = h->writable_elements = NULL;
2099}
2100
c5f17102 2101SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
2a610be5 2102 string_handle_ref, string_handle_set,
f65e0168
LC
2103 string_get_handle)
2104SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
2a610be5 2105
0f2d19dd
JB
2106void
2107scm_init_strings ()
0f2d19dd 2108{
3ee86942 2109 scm_nullstr = scm_i_make_string (0, NULL);
7c33806a 2110
a0599745 2111#include "libguile/strings.x"
0f2d19dd
JB
2112}
2113
89e00824
ML
2114
2115/*
2116 Local Variables:
2117 c-file-style: "gnu"
2118 End:
2119*/