Revert "Install `standard-library.info'."
[bpt/guile.git] / libguile / strings.c
... / ...
CommitLineData
1/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010 Free Software Foundation, Inc.
2 *
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
7 *
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
12 *
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
17 */
18
19
20\f
21#ifdef HAVE_CONFIG_H
22# include <config.h>
23#endif
24
25#include <string.h>
26#include <stdio.h>
27#include <ctype.h>
28#include <uninorm.h>
29#include <unistr.h>
30#include <uniconv.h>
31
32#include "striconveh.h"
33
34#include "libguile/_scm.h"
35#include "libguile/chars.h"
36#include "libguile/root.h"
37#include "libguile/strings.h"
38#include "libguile/error.h"
39#include "libguile/generalized-vectors.h"
40#include "libguile/deprecation.h"
41#include "libguile/validate.h"
42
43\f
44
45/* {Strings}
46 */
47
48
49/* Stringbufs
50 *
51 * XXX - keeping an accurate refcount during GC seems to be quite
52 * tricky, so we just keep score of whether a stringbuf might be
53 * shared, not whether it definitely is.
54 *
55 * The scheme I (mvo) tried to keep an accurate reference count would
56 * recount all strings that point to a stringbuf during the mark-phase
57 * of the GC. This was done since one cannot access the stringbuf of
58 * a string when that string is freed (in order to decrease the
59 * reference count). The memory of the stringbuf might have been
60 * reused already for something completely different.
61 *
62 * This recounted worked for a small number of threads beating on
63 * cow-strings, but it failed randomly with more than 10 threads, say.
64 * I couldn't figure out what went wrong, so I used the conservative
65 * approach implemented below.
66 *
67 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
68 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
69 * strings.
70 */
71
72/* The size in words of the stringbuf header (type tag + size). */
73#define STRINGBUF_HEADER_SIZE 2U
74
75#define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
76
77#define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
78#define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
79
80#define STRINGBUF_TAG scm_tc7_stringbuf
81#define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
82#define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
83
84#define STRINGBUF_CHARS(buf) ((unsigned char *) \
85 SCM_CELL_OBJECT_LOC (buf, \
86 STRINGBUF_HEADER_SIZE))
87#define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
88
89#define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CHARS (buf))
90
91#define SET_STRINGBUF_SHARED(buf) \
92 do \
93 { \
94 /* Don't modify BUF if it's already marked as shared since it might be \
95 a read-only, statically allocated stringbuf. */ \
96 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
97 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
98 } \
99 while (0)
100
101#ifdef SCM_STRING_LENGTH_HISTOGRAM
102static size_t lenhist[1001];
103#endif
104
105/* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
106 characters. */
107static SCM
108make_stringbuf (size_t len)
109{
110 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
111 scm_i_symbol_chars, all stringbufs are null-terminated. Once
112 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
113 has been changed for scm_i_symbol_chars, this null-termination
114 can be dropped.
115 */
116
117 SCM buf;
118
119#ifdef SCM_STRING_LENGTH_HISTOGRAM
120 if (len < 1000)
121 lenhist[len]++;
122 else
123 lenhist[1000]++;
124#endif
125
126 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
127 "string"));
128
129 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
130 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
131
132 STRINGBUF_CHARS (buf)[len] = 0;
133
134 return buf;
135}
136
137/* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
138 characters. */
139static SCM
140make_wide_stringbuf (size_t len)
141{
142 SCM buf;
143 size_t raw_len;
144
145#ifdef SCM_STRING_LENGTH_HISTOGRAM
146 if (len < 1000)
147 lenhist[len]++;
148 else
149 lenhist[1000]++;
150#endif
151
152 raw_len = (len + 1) * sizeof (scm_t_wchar);
153 buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
154 "string"));
155
156 SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
157 SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
158
159 STRINGBUF_WIDE_CHARS (buf)[len] = 0;
160
161 return buf;
162}
163
164/* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
165 characters from BUF. */
166static SCM
167wide_stringbuf (SCM buf)
168{
169 SCM new_buf;
170
171 if (STRINGBUF_WIDE (buf))
172 new_buf = buf;
173 else
174 {
175 size_t i, len;
176 scm_t_wchar *mem;
177
178 len = STRINGBUF_LENGTH (buf);
179
180 new_buf = make_wide_stringbuf (len);
181
182 mem = STRINGBUF_WIDE_CHARS (new_buf);
183 for (i = 0; i < len; i++)
184 mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
185 mem[len] = 0;
186 }
187
188 return new_buf;
189}
190
191/* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
192 characters from BUF, if possible. */
193static SCM
194narrow_stringbuf (SCM buf)
195{
196 SCM new_buf;
197
198 if (!STRINGBUF_WIDE (buf))
199 new_buf = buf;
200 else
201 {
202 size_t i, len;
203 scm_t_wchar *wmem;
204 unsigned char *mem;
205
206 len = STRINGBUF_LENGTH (buf);
207 wmem = STRINGBUF_WIDE_CHARS (buf);
208
209 for (i = 0; i < len; i++)
210 if (wmem[i] > 0xFF)
211 /* BUF cannot be narrowed. */
212 return buf;
213
214 new_buf = make_stringbuf (len);
215
216 mem = STRINGBUF_CHARS (new_buf);
217 for (i = 0; i < len; i++)
218 mem[i] = (unsigned char) wmem[i];
219 mem[len] = 0;
220 }
221
222 return new_buf;
223}
224
225scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
226
227\f
228/* Copy-on-write strings.
229 */
230
231#define STRING_TAG scm_tc7_string
232
233#define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
234#define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
235#define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
236
237#define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
238#define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
239
240#define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
241
242/* Read-only strings.
243 */
244
245#define RO_STRING_TAG scm_tc7_ro_string
246#define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
247
248/* Mutation-sharing substrings
249 */
250
251#define SH_STRING_TAG (scm_tc7_string + 0x100)
252
253#define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
254/* START and LENGTH as for STRINGs. */
255
256#define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
257
258SCM scm_nullstr;
259
260/* Create a scheme string with space for LEN 8-bit Latin-1-encoded
261 characters. CHARSP, if not NULL, will be set to location of the
262 char array. */
263SCM
264scm_i_make_string (size_t len, char **charsp)
265{
266 SCM buf = make_stringbuf (len);
267 SCM res;
268 if (charsp)
269 *charsp = (char *) STRINGBUF_CHARS (buf);
270 res = scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
271 (scm_t_bits)0, (scm_t_bits) len);
272 return res;
273}
274
275/* Create a scheme string with space for LEN 32-bit UCS-4-encoded
276 characters. CHARSP, if not NULL, will be set to location of the
277 character array. */
278SCM
279scm_i_make_wide_string (size_t len, scm_t_wchar **charsp)
280{
281 SCM buf = make_wide_stringbuf (len);
282 SCM res;
283 if (charsp)
284 *charsp = STRINGBUF_WIDE_CHARS (buf);
285 res = scm_double_cell (STRING_TAG, SCM_UNPACK (buf),
286 (scm_t_bits) 0, (scm_t_bits) len);
287 return res;
288}
289
290static void
291validate_substring_args (SCM str, size_t start, size_t end)
292{
293 if (!IS_STRING (str))
294 scm_wrong_type_arg_msg (NULL, 0, str, "string");
295 if (start > STRING_LENGTH (str))
296 scm_out_of_range (NULL, scm_from_size_t (start));
297 if (end > STRING_LENGTH (str) || end < start)
298 scm_out_of_range (NULL, scm_from_size_t (end));
299}
300
301static inline void
302get_str_buf_start (SCM *str, SCM *buf, size_t *start)
303{
304 *start = STRING_START (*str);
305 if (IS_SH_STRING (*str))
306 {
307 *str = SH_STRING_STRING (*str);
308 *start += STRING_START (*str);
309 }
310 *buf = STRING_STRINGBUF (*str);
311}
312
313SCM
314scm_i_substring (SCM str, size_t start, size_t end)
315{
316 SCM buf;
317 size_t str_start;
318 get_str_buf_start (&str, &buf, &str_start);
319 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
320 SET_STRINGBUF_SHARED (buf);
321 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
322 return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
323 (scm_t_bits)str_start + start,
324 (scm_t_bits) end - start);
325}
326
327SCM
328scm_i_substring_read_only (SCM str, size_t start, size_t end)
329{
330 SCM buf;
331 size_t str_start;
332 get_str_buf_start (&str, &buf, &str_start);
333 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
334 SET_STRINGBUF_SHARED (buf);
335 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
336 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
337 (scm_t_bits)str_start + start,
338 (scm_t_bits) end - start);
339}
340
341SCM
342scm_i_substring_copy (SCM str, size_t start, size_t end)
343{
344 size_t len = end - start;
345 SCM buf, my_buf;
346 size_t str_start;
347 get_str_buf_start (&str, &buf, &str_start);
348 if (scm_i_is_narrow_string (str))
349 {
350 my_buf = make_stringbuf (len);
351 memcpy (STRINGBUF_CHARS (my_buf),
352 STRINGBUF_CHARS (buf) + str_start + start, len);
353 }
354 else
355 {
356 my_buf = make_wide_stringbuf (len);
357 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
358 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
359 + start), len);
360 /* Even though this string is wide, the substring may be narrow.
361 Consider adding code to narrow the string. */
362 }
363 scm_remember_upto_here_1 (buf);
364 return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
365 (scm_t_bits) 0, (scm_t_bits) len);
366}
367
368SCM
369scm_i_substring_shared (SCM str, size_t start, size_t end)
370{
371 if (start == 0 && end == STRING_LENGTH (str))
372 return str;
373 else
374 {
375 size_t len = end - start;
376 if (IS_SH_STRING (str))
377 {
378 start += STRING_START (str);
379 str = SH_STRING_STRING (str);
380 }
381 return scm_double_cell (SH_STRING_TAG, SCM_UNPACK(str),
382 (scm_t_bits)start, (scm_t_bits) len);
383 }
384}
385
386SCM
387scm_c_substring (SCM str, size_t start, size_t end)
388{
389 validate_substring_args (str, start, end);
390 return scm_i_substring (str, start, end);
391}
392
393SCM
394scm_c_substring_read_only (SCM str, size_t start, size_t end)
395{
396 validate_substring_args (str, start, end);
397 return scm_i_substring_read_only (str, start, end);
398}
399
400SCM
401scm_c_substring_copy (SCM str, size_t start, size_t end)
402{
403 validate_substring_args (str, start, end);
404 return scm_i_substring_copy (str, start, end);
405}
406
407SCM
408scm_c_substring_shared (SCM str, size_t start, size_t end)
409{
410 validate_substring_args (str, start, end);
411 return scm_i_substring_shared (str, start, end);
412}
413
414\f
415/* Internal accessors
416 */
417
418/* Returns the number of characters in STR. This may be different
419 than the memory size of the string storage. */
420size_t
421scm_i_string_length (SCM str)
422{
423 return STRING_LENGTH (str);
424}
425
426/* True if the string is 'narrow', meaning it has a 8-bit Latin-1
427 encoding. False if it is 'wide', having a 32-bit UCS-4
428 encoding. */
429int
430scm_i_is_narrow_string (SCM str)
431{
432 return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
433}
434
435/* Try to coerce a string to be narrow. It if is narrow already, do
436 nothing. If it is wide, shrink it to narrow if none of its
437 characters are above 0xFF. Return true if the string is narrow or
438 was made to be narrow. */
439int
440scm_i_try_narrow_string (SCM str)
441{
442 SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
443
444 return scm_i_is_narrow_string (str);
445}
446
447/* Returns a pointer to the 8-bit Latin-1 encoded character array of
448 STR. */
449const char *
450scm_i_string_chars (SCM str)
451{
452 SCM buf;
453 size_t start;
454 get_str_buf_start (&str, &buf, &start);
455 if (scm_i_is_narrow_string (str))
456 return (const char *) STRINGBUF_CHARS (buf) + start;
457 else
458 scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
459 scm_list_1 (str));
460 return NULL;
461}
462
463/* Returns a pointer to the 32-bit UCS-4 encoded character array of
464 STR. */
465const scm_t_wchar *
466scm_i_string_wide_chars (SCM str)
467{
468 SCM buf;
469 size_t start;
470
471 get_str_buf_start (&str, &buf, &start);
472 if (!scm_i_is_narrow_string (str))
473 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
474 else
475 scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
476 scm_list_1 (str));
477}
478
479/* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
480 a new string buffer, so that it can be modified without modifying
481 other strings. Also, lock the string mutex. Later, one must call
482 scm_i_string_stop_writing to unlock the mutex. */
483SCM
484scm_i_string_start_writing (SCM orig_str)
485{
486 SCM buf, str = orig_str;
487 size_t start;
488
489 get_str_buf_start (&str, &buf, &start);
490 if (IS_RO_STRING (str))
491 scm_misc_error (NULL, "string is read-only: ~s", scm_list_1 (orig_str));
492
493 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
494 if (STRINGBUF_SHARED (buf))
495 {
496 /* Clone the stringbuf. */
497 size_t len = STRING_LENGTH (str);
498 SCM new_buf;
499
500 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
501
502 if (scm_i_is_narrow_string (str))
503 {
504 new_buf = make_stringbuf (len);
505 memcpy (STRINGBUF_CHARS (new_buf),
506 STRINGBUF_CHARS (buf) + STRING_START (str), len);
507
508 }
509 else
510 {
511 new_buf = make_wide_stringbuf (len);
512 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
513 (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
514 + STRING_START (str)), len);
515 }
516
517 SET_STRING_STRINGBUF (str, new_buf);
518 start -= STRING_START (str);
519
520 /* FIXME: The following operations are not atomic, so other threads
521 looking at STR may see an inconsistent state. Nevertheless it can't
522 hurt much since (i) accessing STR while it is being mutated can't
523 yield a crash, and (ii) concurrent accesses to STR should be
524 protected by a mutex at the application level. The latter may not
525 apply when STR != ORIG_STR, though. */
526 SET_STRING_START (str, 0);
527 SET_STRING_STRINGBUF (str, new_buf);
528
529 buf = new_buf;
530
531 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
532 }
533 return orig_str;
534}
535
536/* Return a pointer to the 8-bit Latin-1 chars of a string. */
537char *
538scm_i_string_writable_chars (SCM str)
539{
540 SCM buf;
541 size_t start;
542
543 get_str_buf_start (&str, &buf, &start);
544 if (scm_i_is_narrow_string (str))
545 return (char *) STRINGBUF_CHARS (buf) + start;
546 else
547 scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
548 scm_list_1 (str));
549 return NULL;
550}
551
552/* Return a pointer to the UCS-4 codepoints of a string. */
553static scm_t_wchar *
554scm_i_string_writable_wide_chars (SCM str)
555{
556 SCM buf;
557 size_t start;
558
559 get_str_buf_start (&str, &buf, &start);
560 if (!scm_i_is_narrow_string (str))
561 return STRINGBUF_WIDE_CHARS (buf) + start;
562 else
563 scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
564 scm_list_1 (str));
565}
566
567/* Unlock the string mutex that was locked when
568 scm_i_string_start_writing was called. */
569void
570scm_i_string_stop_writing (void)
571{
572 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
573}
574
575/* Return the Xth character of STR as a UCS-4 codepoint. */
576scm_t_wchar
577scm_i_string_ref (SCM str, size_t x)
578{
579 if (scm_i_is_narrow_string (str))
580 return (scm_t_wchar) (unsigned char) (scm_i_string_chars (str)[x]);
581 else
582 return scm_i_string_wide_chars (str)[x];
583}
584
585/* Returns index+1 of the first char in STR that matches C, or
586 0 if the char is not found. */
587int
588scm_i_string_contains_char (SCM str, char ch)
589{
590 size_t i;
591 size_t len = scm_i_string_length (str);
592
593 i = 0;
594 if (scm_i_is_narrow_string (str))
595 {
596 while (i < len)
597 {
598 if (scm_i_string_chars (str)[i] == ch)
599 return i+1;
600 i++;
601 }
602 }
603 else
604 {
605 while (i < len)
606 {
607 if (scm_i_string_wide_chars (str)[i]
608 == (unsigned char) ch)
609 return i+1;
610 i++;
611 }
612 }
613 return 0;
614}
615
616int
617scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
618{
619 if (scm_i_is_narrow_string (sstr))
620 {
621 const char *a = scm_i_string_chars (sstr) + start_x;
622 const char *b = cstr;
623 return strncmp (a, b, strlen(b));
624 }
625 else
626 {
627 size_t i;
628 const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
629 const char *b = cstr;
630 for (i = 0; i < strlen (b); i++)
631 {
632 if (a[i] != (unsigned char) b[i])
633 return 1;
634 }
635 }
636 return 0;
637}
638
639/* Set the Pth character of STR to UCS-4 codepoint CHR. */
640void
641scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
642{
643 if (chr > 0xFF && scm_i_is_narrow_string (str))
644 SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
645
646 if (scm_i_is_narrow_string (str))
647 {
648 char *dst = scm_i_string_writable_chars (str);
649 dst[p] = chr;
650 }
651 else
652 {
653 scm_t_wchar *dst = scm_i_string_writable_wide_chars (str);
654 dst[p] = chr;
655 }
656}
657
658\f
659/* Symbols.
660
661 Basic symbol creation and accessing is done here, the rest is in
662 symbols.[hc]. This has been done to keep stringbufs and the
663 internals of strings and string-like objects confined to this file.
664*/
665
666#define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
667
668SCM
669scm_i_make_symbol (SCM name, scm_t_bits flags,
670 unsigned long hash, SCM props)
671{
672 SCM buf;
673 size_t start = STRING_START (name);
674 size_t length = STRING_LENGTH (name);
675
676 if (IS_SH_STRING (name))
677 {
678 name = SH_STRING_STRING (name);
679 start += STRING_START (name);
680 }
681 buf = SYMBOL_STRINGBUF (name);
682
683 if (start == 0 && length == STRINGBUF_LENGTH (buf))
684 {
685 /* reuse buf. */
686 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
687 SET_STRINGBUF_SHARED (buf);
688 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
689 }
690 else
691 {
692 /* make new buf. */
693 if (scm_i_is_narrow_string (name))
694 {
695 SCM new_buf = make_stringbuf (length);
696 memcpy (STRINGBUF_CHARS (new_buf),
697 STRINGBUF_CHARS (buf) + start, length);
698 buf = new_buf;
699 }
700 else
701 {
702 SCM new_buf = make_wide_stringbuf (length);
703 u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (new_buf),
704 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf) + start,
705 length);
706 buf = new_buf;
707 }
708 }
709 return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
710 (scm_t_bits) hash, SCM_UNPACK (props));
711}
712
713SCM
714scm_i_c_make_symbol (const char *name, size_t len,
715 scm_t_bits flags, unsigned long hash, SCM props)
716{
717 SCM buf = make_stringbuf (len);
718 memcpy (STRINGBUF_CHARS (buf), name, len);
719
720 return scm_immutable_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
721 (scm_t_bits) hash, SCM_UNPACK (props));
722}
723
724/* Returns the number of characters in SYM. This may be different
725 from the memory size of SYM. */
726size_t
727scm_i_symbol_length (SCM sym)
728{
729 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
730}
731
732size_t
733scm_c_symbol_length (SCM sym)
734#define FUNC_NAME "scm_c_symbol_length"
735{
736 SCM_VALIDATE_SYMBOL (1, sym);
737
738 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym));
739}
740#undef FUNC_NAME
741
742/* True if the name of SYM is stored as a Latin-1 encoded string.
743 False if it is stored as a 32-bit UCS-4-encoded string. */
744int
745scm_i_is_narrow_symbol (SCM sym)
746{
747 SCM buf;
748
749 buf = SYMBOL_STRINGBUF (sym);
750 return !STRINGBUF_WIDE (buf);
751}
752
753/* Returns a pointer to the 8-bit Latin-1 encoded character array that
754 contains the name of SYM. */
755const char *
756scm_i_symbol_chars (SCM sym)
757{
758 SCM buf;
759
760 buf = SYMBOL_STRINGBUF (sym);
761 if (!STRINGBUF_WIDE (buf))
762 return (const char *) STRINGBUF_CHARS (buf);
763 else
764 scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
765 scm_list_1 (sym));
766}
767
768/* Return a pointer to the 32-bit UCS-4-encoded character array of a
769 symbol's name. */
770const scm_t_wchar *
771scm_i_symbol_wide_chars (SCM sym)
772{
773 SCM buf;
774
775 buf = SYMBOL_STRINGBUF (sym);
776 if (STRINGBUF_WIDE (buf))
777 return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
778 else
779 scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
780 scm_list_1 (sym));
781}
782
783SCM
784scm_i_symbol_substring (SCM sym, size_t start, size_t end)
785{
786 SCM buf = SYMBOL_STRINGBUF (sym);
787 scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
788 SET_STRINGBUF_SHARED (buf);
789 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
790 return scm_double_cell (RO_STRING_TAG, SCM_UNPACK (buf),
791 (scm_t_bits)start, (scm_t_bits) end - start);
792}
793
794/* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
795scm_t_wchar
796scm_i_symbol_ref (SCM sym, size_t x)
797{
798 if (scm_i_is_narrow_symbol (sym))
799 return (scm_t_wchar) (unsigned char) (scm_i_symbol_chars (sym)[x]);
800 else
801 return scm_i_symbol_wide_chars (sym)[x];
802}
803
804/* Debugging
805 */
806
807SCM_DEFINE (scm_sys_string_dump, "%string-dump", 1, 0, 0, (SCM str),
808 "Returns an association list containing debugging information\n"
809 "for @var{str}. The association list has the following entries."
810 "@table @code\n"
811 "@item string\n"
812 "The string itself.\n"
813 "@item start\n"
814 "The start index of the string into its stringbuf\n"
815 "@item length\n"
816 "The length of the string\n"
817 "@item shared\n"
818 "If this string is a substring, it returns its parent string.\n"
819 "Otherwise, it returns @code{#f}\n"
820 "@item read-only\n"
821 "@code{#t} if the string is read-only\n"
822 "@item stringbuf-chars\n"
823 "A new string containing this string's stringbuf's characters\n"
824 "@item stringbuf-length\n"
825 "The number of characters in this stringbuf\n"
826 "@item stringbuf-shared\n"
827 "@code{#t} if this stringbuf is shared\n"
828 "@item stringbuf-wide\n"
829 "@code{#t} if this stringbuf's characters are stored in a\n"
830 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
831 "buffer\n"
832 "@end table")
833#define FUNC_NAME s_scm_sys_string_dump
834{
835 SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
836 SCM buf;
837 SCM_VALIDATE_STRING (1, str);
838
839 /* String info */
840 e1 = scm_cons (scm_from_locale_symbol ("string"),
841 str);
842 e2 = scm_cons (scm_from_locale_symbol ("start"),
843 scm_from_size_t (STRING_START (str)));
844 e3 = scm_cons (scm_from_locale_symbol ("length"),
845 scm_from_size_t (STRING_LENGTH (str)));
846
847 if (IS_SH_STRING (str))
848 {
849 e4 = scm_cons (scm_from_locale_symbol ("shared"),
850 SH_STRING_STRING (str));
851 buf = STRING_STRINGBUF (SH_STRING_STRING (str));
852 }
853 else
854 {
855 e4 = scm_cons (scm_from_locale_symbol ("shared"),
856 SCM_BOOL_F);
857 buf = STRING_STRINGBUF (str);
858 }
859
860 if (IS_RO_STRING (str))
861 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
862 SCM_BOOL_T);
863 else
864 e5 = scm_cons (scm_from_locale_symbol ("read-only"),
865 SCM_BOOL_F);
866
867 /* Stringbuf info */
868 if (!STRINGBUF_WIDE (buf))
869 {
870 size_t len = STRINGBUF_LENGTH (buf);
871 char *cbuf;
872 SCM sbc = scm_i_make_string (len, &cbuf);
873 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
874 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
875 sbc);
876 }
877 else
878 {
879 size_t len = STRINGBUF_LENGTH (buf);
880 scm_t_wchar *cbuf;
881 SCM sbc = scm_i_make_wide_string (len, &cbuf);
882 u32_cpy ((scm_t_uint32 *) cbuf,
883 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
884 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
885 sbc);
886 }
887 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
888 scm_from_size_t (STRINGBUF_LENGTH (buf)));
889 if (STRINGBUF_SHARED (buf))
890 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
891 SCM_BOOL_T);
892 else
893 e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
894 SCM_BOOL_F);
895 if (STRINGBUF_WIDE (buf))
896 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
897 SCM_BOOL_T);
898 else
899 e9 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
900 SCM_BOOL_F);
901
902 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
903}
904#undef FUNC_NAME
905
906SCM_DEFINE (scm_sys_symbol_dump, "%symbol-dump", 1, 0, 0, (SCM sym),
907 "Returns an association list containing debugging information\n"
908 "for @var{sym}. The association list has the following entries."
909 "@table @code\n"
910 "@item symbol\n"
911 "The symbol itself\n"
912 "@item hash\n"
913 "Its hash value\n"
914 "@item interned\n"
915 "@code{#t} if it is an interned symbol\n"
916 "@item stringbuf-chars\n"
917 "A new string containing this symbols's stringbuf's characters\n"
918 "@item stringbuf-length\n"
919 "The number of characters in this stringbuf\n"
920 "@item stringbuf-shared\n"
921 "@code{#t} if this stringbuf is shared\n"
922 "@item stringbuf-wide\n"
923 "@code{#t} if this stringbuf's characters are stored in a\n"
924 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
925 "buffer\n"
926 "@end table")
927#define FUNC_NAME s_scm_sys_symbol_dump
928{
929 SCM e1, e2, e3, e4, e5, e6, e7;
930 SCM buf;
931 SCM_VALIDATE_SYMBOL (1, sym);
932 e1 = scm_cons (scm_from_locale_symbol ("symbol"),
933 sym);
934 e2 = scm_cons (scm_from_locale_symbol ("hash"),
935 scm_from_ulong (scm_i_symbol_hash (sym)));
936 e3 = scm_cons (scm_from_locale_symbol ("interned"),
937 scm_symbol_interned_p (sym));
938 buf = SYMBOL_STRINGBUF (sym);
939
940 /* Stringbuf info */
941 if (!STRINGBUF_WIDE (buf))
942 {
943 size_t len = STRINGBUF_LENGTH (buf);
944 char *cbuf;
945 SCM sbc = scm_i_make_string (len, &cbuf);
946 memcpy (cbuf, STRINGBUF_CHARS (buf), len);
947 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
948 sbc);
949 }
950 else
951 {
952 size_t len = STRINGBUF_LENGTH (buf);
953 scm_t_wchar *cbuf;
954 SCM sbc = scm_i_make_wide_string (len, &cbuf);
955 u32_cpy ((scm_t_uint32 *) cbuf,
956 (scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
957 e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
958 sbc);
959 }
960 e5 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
961 scm_from_size_t (STRINGBUF_LENGTH (buf)));
962 if (STRINGBUF_SHARED (buf))
963 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
964 SCM_BOOL_T);
965 else
966 e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
967 SCM_BOOL_F);
968 if (STRINGBUF_WIDE (buf))
969 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
970 SCM_BOOL_T);
971 else
972 e7 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
973 SCM_BOOL_F);
974 return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
975
976}
977#undef FUNC_NAME
978
979#ifdef SCM_STRING_LENGTH_HISTOGRAM
980
981SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
982#define FUNC_NAME s_scm_sys_stringbuf_hist
983{
984 int i;
985 for (i = 0; i < 1000; i++)
986 if (lenhist[i])
987 fprintf (stderr, " %3d: %u\n", i, lenhist[i]);
988 fprintf (stderr, ">999: %u\n", lenhist[1000]);
989 return SCM_UNSPECIFIED;
990}
991#undef FUNC_NAME
992
993#endif
994
995\f
996
997SCM_DEFINE (scm_string_p, "string?", 1, 0, 0,
998 (SCM obj),
999 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1000#define FUNC_NAME s_scm_string_p
1001{
1002 return scm_from_bool (IS_STRING (obj));
1003}
1004#undef FUNC_NAME
1005
1006
1007SCM_REGISTER_PROC (s_scm_list_to_string, "list->string", 1, 0, 0, scm_string);
1008
1009SCM_DEFINE (scm_string, "string", 0, 0, 1,
1010 (SCM chrs),
1011 "@deffnx {Scheme Procedure} list->string chrs\n"
1012 "Return a newly allocated string composed of the arguments,\n"
1013 "@var{chrs}.")
1014#define FUNC_NAME s_scm_string
1015{
1016 SCM result = SCM_BOOL_F;
1017 SCM rest;
1018 size_t len;
1019 size_t p = 0;
1020 long i;
1021 int wide = 0;
1022
1023 /* Verify that this is a list of chars. */
1024 i = scm_ilength (chrs);
1025 SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
1026
1027 len = (size_t) i;
1028 rest = chrs;
1029
1030 while (len > 0 && scm_is_pair (rest))
1031 {
1032 SCM elt = SCM_CAR (rest);
1033 SCM_VALIDATE_CHAR (SCM_ARGn, elt);
1034 if (SCM_CHAR (elt) > 0xFF)
1035 wide = 1;
1036 rest = SCM_CDR (rest);
1037 len--;
1038 scm_remember_upto_here_1 (elt);
1039 }
1040
1041 /* Construct a string containing this list of chars. */
1042 len = (size_t) i;
1043 rest = chrs;
1044
1045 if (wide == 0)
1046 {
1047 char *buf;
1048
1049 result = scm_i_make_string (len, NULL);
1050 result = scm_i_string_start_writing (result);
1051 buf = scm_i_string_writable_chars (result);
1052 while (len > 0 && scm_is_pair (rest))
1053 {
1054 SCM elt = SCM_CAR (rest);
1055 buf[p] = (unsigned char) SCM_CHAR (elt);
1056 p++;
1057 rest = SCM_CDR (rest);
1058 len--;
1059 scm_remember_upto_here_1 (elt);
1060 }
1061 }
1062 else
1063 {
1064 scm_t_wchar *buf;
1065
1066 result = scm_i_make_wide_string (len, NULL);
1067 result = scm_i_string_start_writing (result);
1068 buf = scm_i_string_writable_wide_chars (result);
1069 while (len > 0 && scm_is_pair (rest))
1070 {
1071 SCM elt = SCM_CAR (rest);
1072 buf[p] = SCM_CHAR (elt);
1073 p++;
1074 rest = SCM_CDR (rest);
1075 len--;
1076 scm_remember_upto_here_1 (elt);
1077 }
1078 }
1079 scm_i_string_stop_writing ();
1080
1081 if (len > 0)
1082 scm_misc_error (NULL, "list changed while constructing string", SCM_EOL);
1083 if (!scm_is_null (rest))
1084 scm_wrong_type_arg_msg (NULL, 0, chrs, "proper list");
1085
1086 return result;
1087}
1088#undef FUNC_NAME
1089
1090SCM_DEFINE (scm_make_string, "make-string", 1, 1, 0,
1091 (SCM k, SCM chr),
1092 "Return a newly allocated string of\n"
1093 "length @var{k}. If @var{chr} is given, then all elements of\n"
1094 "the string are initialized to @var{chr}, otherwise the contents\n"
1095 "of the @var{string} are unspecified.")
1096#define FUNC_NAME s_scm_make_string
1097{
1098 return scm_c_make_string (scm_to_size_t (k), chr);
1099}
1100#undef FUNC_NAME
1101
1102SCM
1103scm_c_make_string (size_t len, SCM chr)
1104#define FUNC_NAME NULL
1105{
1106 size_t p;
1107 SCM res = scm_i_make_string (len, NULL);
1108
1109 if (!SCM_UNBNDP (chr))
1110 {
1111 SCM_VALIDATE_CHAR (0, chr);
1112 res = scm_i_string_start_writing (res);
1113 for (p = 0; p < len; p++)
1114 scm_i_string_set_x (res, p, SCM_CHAR (chr));
1115 scm_i_string_stop_writing ();
1116 }
1117
1118 return res;
1119}
1120#undef FUNC_NAME
1121
1122SCM_DEFINE (scm_string_length, "string-length", 1, 0, 0,
1123 (SCM string),
1124 "Return the number of characters in @var{string}.")
1125#define FUNC_NAME s_scm_string_length
1126{
1127 SCM_VALIDATE_STRING (1, string);
1128 return scm_from_size_t (STRING_LENGTH (string));
1129}
1130#undef FUNC_NAME
1131
1132SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
1133 (SCM string),
1134 "Return the bytes used to represent a character in @var{string}."
1135 "This will return 1 or 4.")
1136#define FUNC_NAME s_scm_string_bytes_per_char
1137{
1138 SCM_VALIDATE_STRING (1, string);
1139 if (!scm_i_is_narrow_string (string))
1140 return scm_from_int (4);
1141
1142 return scm_from_int (1);
1143}
1144#undef FUNC_NAME
1145
1146size_t
1147scm_c_string_length (SCM string)
1148{
1149 if (!IS_STRING (string))
1150 scm_wrong_type_arg_msg (NULL, 0, string, "string");
1151 return STRING_LENGTH (string);
1152}
1153
1154SCM_DEFINE (scm_string_ref, "string-ref", 2, 0, 0,
1155 (SCM str, SCM k),
1156 "Return character @var{k} of @var{str} using zero-origin\n"
1157 "indexing. @var{k} must be a valid index of @var{str}.")
1158#define FUNC_NAME s_scm_string_ref
1159{
1160 size_t len;
1161 unsigned long idx;
1162
1163 SCM_VALIDATE_STRING (1, str);
1164
1165 len = scm_i_string_length (str);
1166 if (SCM_LIKELY (len > 0))
1167 idx = scm_to_unsigned_integer (k, 0, len - 1);
1168 else
1169 scm_out_of_range (NULL, k);
1170
1171 if (scm_i_is_narrow_string (str))
1172 return SCM_MAKE_CHAR (scm_i_string_chars (str)[idx]);
1173 else
1174 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[idx]);
1175}
1176#undef FUNC_NAME
1177
1178SCM
1179scm_c_string_ref (SCM str, size_t p)
1180{
1181 if (p >= scm_i_string_length (str))
1182 scm_out_of_range (NULL, scm_from_size_t (p));
1183 if (scm_i_is_narrow_string (str))
1184 return SCM_MAKE_CHAR (scm_i_string_chars (str)[p]);
1185 else
1186 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str)[p]);
1187
1188}
1189
1190SCM_DEFINE (scm_string_set_x, "string-set!", 3, 0, 0,
1191 (SCM str, SCM k, SCM chr),
1192 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1193 "an unspecified value. @var{k} must be a valid index of\n"
1194 "@var{str}.")
1195#define FUNC_NAME s_scm_string_set_x
1196{
1197 size_t len;
1198 unsigned long idx;
1199
1200 SCM_VALIDATE_STRING (1, str);
1201
1202 len = scm_i_string_length (str);
1203 if (SCM_LIKELY (len > 0))
1204 idx = scm_to_unsigned_integer (k, 0, len - 1);
1205 else
1206 scm_out_of_range (NULL, k);
1207
1208 SCM_VALIDATE_CHAR (3, chr);
1209 str = scm_i_string_start_writing (str);
1210 scm_i_string_set_x (str, idx, SCM_CHAR (chr));
1211 scm_i_string_stop_writing ();
1212
1213 return SCM_UNSPECIFIED;
1214}
1215#undef FUNC_NAME
1216
1217void
1218scm_c_string_set_x (SCM str, size_t p, SCM chr)
1219{
1220 if (p >= scm_i_string_length (str))
1221 scm_out_of_range (NULL, scm_from_size_t (p));
1222 str = scm_i_string_start_writing (str);
1223 scm_i_string_set_x (str, p, SCM_CHAR (chr));
1224 scm_i_string_stop_writing ();
1225}
1226
1227SCM_DEFINE (scm_substring, "substring", 2, 1, 0,
1228 (SCM str, SCM start, SCM end),
1229 "Return a newly allocated string formed from the characters\n"
1230 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1231 "ending with index @var{end} (exclusive).\n"
1232 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1233 "exact integers satisfying:\n\n"
1234 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1235#define FUNC_NAME s_scm_substring
1236{
1237 size_t len, from, to;
1238
1239 SCM_VALIDATE_STRING (1, str);
1240 len = scm_i_string_length (str);
1241 from = scm_to_unsigned_integer (start, 0, len);
1242 if (SCM_UNBNDP (end))
1243 to = len;
1244 else
1245 to = scm_to_unsigned_integer (end, from, len);
1246 return scm_i_substring (str, from, to);
1247}
1248#undef FUNC_NAME
1249
1250SCM_DEFINE (scm_substring_read_only, "substring/read-only", 2, 1, 0,
1251 (SCM str, SCM start, SCM end),
1252 "Return a newly allocated string formed from the characters\n"
1253 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1254 "ending with index @var{end} (exclusive).\n"
1255 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1256 "exact integers satisfying:\n"
1257 "\n"
1258 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1259 "\n"
1260 "The returned string is read-only.\n")
1261#define FUNC_NAME s_scm_substring_read_only
1262{
1263 size_t len, from, to;
1264
1265 SCM_VALIDATE_STRING (1, str);
1266 len = scm_i_string_length (str);
1267 from = scm_to_unsigned_integer (start, 0, len);
1268 if (SCM_UNBNDP (end))
1269 to = len;
1270 else
1271 to = scm_to_unsigned_integer (end, from, len);
1272 return scm_i_substring_read_only (str, from, to);
1273}
1274#undef FUNC_NAME
1275
1276SCM_DEFINE (scm_substring_copy, "substring/copy", 2, 1, 0,
1277 (SCM str, SCM start, SCM end),
1278 "Return a newly allocated string formed from the characters\n"
1279 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1280 "ending with index @var{end} (exclusive).\n"
1281 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1282 "exact integers satisfying:\n\n"
1283 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1284#define FUNC_NAME s_scm_substring_copy
1285{
1286 /* For the Scheme version, START is mandatory, but for the C
1287 version, it is optional. See scm_string_copy in srfi-13.c for a
1288 rationale.
1289 */
1290
1291 size_t from, to;
1292
1293 SCM_VALIDATE_STRING (1, str);
1294 scm_i_get_substring_spec (scm_i_string_length (str),
1295 start, &from, end, &to);
1296 return scm_i_substring_copy (str, from, to);
1297}
1298#undef FUNC_NAME
1299
1300SCM_DEFINE (scm_substring_shared, "substring/shared", 2, 1, 0,
1301 (SCM str, SCM start, SCM end),
1302 "Return string that indirectly refers to the characters\n"
1303 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1304 "ending with index @var{end} (exclusive).\n"
1305 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1306 "exact integers satisfying:\n\n"
1307 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1308#define FUNC_NAME s_scm_substring_shared
1309{
1310 size_t len, from, to;
1311
1312 SCM_VALIDATE_STRING (1, str);
1313 len = scm_i_string_length (str);
1314 from = scm_to_unsigned_integer (start, 0, len);
1315 if (SCM_UNBNDP (end))
1316 to = len;
1317 else
1318 to = scm_to_unsigned_integer (end, from, len);
1319 return scm_i_substring_shared (str, from, to);
1320}
1321#undef FUNC_NAME
1322
1323SCM_DEFINE (scm_string_append, "string-append", 0, 0, 1,
1324 (SCM args),
1325 "Return a newly allocated string whose characters form the\n"
1326 "concatenation of the given strings, @var{args}.")
1327#define FUNC_NAME s_scm_string_append
1328{
1329 SCM res;
1330 size_t len = 0;
1331 int wide = 0;
1332 SCM l, s;
1333 size_t i;
1334 union
1335 {
1336 char *narrow;
1337 scm_t_wchar *wide;
1338 } data;
1339
1340 SCM_VALIDATE_REST_ARGUMENT (args);
1341 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1342 {
1343 s = SCM_CAR (l);
1344 SCM_VALIDATE_STRING (SCM_ARGn, s);
1345 len += scm_i_string_length (s);
1346 if (!scm_i_is_narrow_string (s))
1347 wide = 1;
1348 }
1349 data.narrow = NULL;
1350 if (!wide)
1351 res = scm_i_make_string (len, &data.narrow);
1352 else
1353 res = scm_i_make_wide_string (len, &data.wide);
1354
1355 for (l = args; !scm_is_null (l); l = SCM_CDR (l))
1356 {
1357 size_t len;
1358 s = SCM_CAR (l);
1359 SCM_VALIDATE_STRING (SCM_ARGn, s);
1360 len = scm_i_string_length (s);
1361 if (!wide)
1362 {
1363 memcpy (data.narrow, scm_i_string_chars (s), len);
1364 data.narrow += len;
1365 }
1366 else
1367 {
1368 if (scm_i_is_narrow_string (s))
1369 {
1370 for (i = 0; i < scm_i_string_length (s); i++)
1371 data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
1372 }
1373 else
1374 u32_cpy ((scm_t_uint32 *) data.wide,
1375 (scm_t_uint32 *) scm_i_string_wide_chars (s), len);
1376 data.wide += len;
1377 }
1378 scm_remember_upto_here_1 (s);
1379 }
1380 return res;
1381}
1382#undef FUNC_NAME
1383
1384int
1385scm_is_string (SCM obj)
1386{
1387 return IS_STRING (obj);
1388}
1389
1390\f
1391/* Conversion to/from other encodings. */
1392
1393SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
1394static void
1395scm_encoding_error (const char *subr, const char *message, SCM args)
1396{
1397 scm_error (scm_encoding_error_key, subr, message, args, SCM_BOOL_F);
1398}
1399
1400SCM
1401scm_from_stringn (const char *str, size_t len, const char *encoding,
1402 scm_t_string_failed_conversion_handler handler)
1403{
1404 size_t u32len, i;
1405 scm_t_wchar *u32;
1406 int wide = 0;
1407 SCM res;
1408
1409 if (len == 0)
1410 return scm_nullstr;
1411
1412 if (encoding == NULL)
1413 {
1414 /* If encoding is null, use Latin-1. */
1415 char *buf;
1416 res = scm_i_make_string (len, &buf);
1417 memcpy (buf, str, len);
1418 return res;
1419 }
1420
1421 u32len = 0;
1422 u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
1423 (enum iconv_ilseq_handler)
1424 handler,
1425 str, len,
1426 NULL,
1427 NULL, &u32len);
1428
1429 if (u32 == NULL)
1430 {
1431 if (errno == ENOMEM)
1432 scm_memory_error ("locale string conversion");
1433 else
1434 {
1435 /* There are invalid sequences in the input string. */
1436 SCM errstr;
1437 char *dst;
1438 errstr = scm_i_make_string (len, &dst);
1439 memcpy (dst, str, len);
1440 scm_encoding_error (NULL,
1441 "input locale conversion error from ~s: ~s",
1442 scm_list_2 (scm_from_locale_string (encoding),
1443 errstr));
1444 scm_remember_upto_here_1 (errstr);
1445 }
1446 }
1447
1448 i = 0;
1449 while (i < u32len)
1450 if (u32[i++] > 0xFF)
1451 {
1452 wide = 1;
1453 break;
1454 }
1455
1456 if (!wide)
1457 {
1458 char *dst;
1459 res = scm_i_make_string (u32len, &dst);
1460 for (i = 0; i < u32len; i ++)
1461 dst[i] = (unsigned char) u32[i];
1462 dst[u32len] = '\0';
1463 }
1464 else
1465 {
1466 scm_t_wchar *wdst;
1467 res = scm_i_make_wide_string (u32len, &wdst);
1468 u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
1469 wdst[u32len] = 0;
1470 }
1471
1472 free (u32);
1473 return res;
1474}
1475
1476SCM
1477scm_from_locale_stringn (const char *str, size_t len)
1478{
1479 const char *enc;
1480 scm_t_string_failed_conversion_handler hndl;
1481 SCM inport;
1482 scm_t_port *pt;
1483
1484 if (len == (size_t) -1)
1485 len = strlen (str);
1486 if (len == 0)
1487 return scm_nullstr;
1488
1489 inport = scm_current_input_port ();
1490 if (!SCM_UNBNDP (inport) && SCM_OPINPORTP (inport))
1491 {
1492 pt = SCM_PTAB_ENTRY (inport);
1493 enc = pt->encoding;
1494 hndl = pt->ilseq_handler;
1495 }
1496 else
1497 {
1498 enc = NULL;
1499 hndl = SCM_FAILED_CONVERSION_ERROR;
1500 }
1501
1502 return scm_from_stringn (str, len, enc, hndl);
1503}
1504
1505SCM
1506scm_from_locale_string (const char *str)
1507{
1508 if (str == NULL)
1509 return scm_nullstr;
1510
1511 return scm_from_locale_stringn (str, -1);
1512}
1513
1514SCM
1515scm_i_from_utf8_string (const scm_t_uint8 *str)
1516{
1517 return scm_from_stringn ((const char *) str,
1518 strlen ((char *) str), "UTF-8",
1519 SCM_FAILED_CONVERSION_ERROR);
1520}
1521
1522/* Create a new scheme string from the C string STR. The memory of
1523 STR may be used directly as storage for the new string. */
1524/* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1525 would be to register a finalizer to eventually free(3) STR, which isn't
1526 worth it. Should we just deprecate the `scm_take_' functions? */
1527SCM
1528scm_take_locale_stringn (char *str, size_t len)
1529{
1530 SCM res;
1531
1532 res = scm_from_locale_stringn (str, len);
1533 free (str);
1534
1535 return res;
1536}
1537
1538SCM
1539scm_take_locale_string (char *str)
1540{
1541 return scm_take_locale_stringn (str, -1);
1542}
1543
1544/* Change libunistring escapes (\uXXXX and \UXXXXXXXX) to \xXX \uXXXX
1545 and \UXXXXXX. */
1546static void
1547unistring_escapes_to_guile_escapes (char **bufp, size_t *lenp)
1548{
1549 char *before, *after;
1550 size_t i, j;
1551
1552 before = *bufp;
1553 after = *bufp;
1554 i = 0;
1555 j = 0;
1556 while (i < *lenp)
1557 {
1558 if ((i <= *lenp - 6)
1559 && before[i] == '\\'
1560 && before[i + 1] == 'u'
1561 && before[i + 2] == '0' && before[i + 3] == '0')
1562 {
1563 /* Convert \u00NN to \xNN */
1564 after[j] = '\\';
1565 after[j + 1] = 'x';
1566 after[j + 2] = tolower ((int) before[i + 4]);
1567 after[j + 3] = tolower ((int) before[i + 5]);
1568 i += 6;
1569 j += 4;
1570 }
1571 else if ((i <= *lenp - 10)
1572 && before[i] == '\\'
1573 && before[i + 1] == 'U'
1574 && before[i + 2] == '0' && before[i + 3] == '0')
1575 {
1576 /* Convert \U00NNNNNN to \UNNNNNN */
1577 after[j] = '\\';
1578 after[j + 1] = 'U';
1579 after[j + 2] = tolower ((int) before[i + 4]);
1580 after[j + 3] = tolower ((int) before[i + 5]);
1581 after[j + 4] = tolower ((int) before[i + 6]);
1582 after[j + 5] = tolower ((int) before[i + 7]);
1583 after[j + 6] = tolower ((int) before[i + 8]);
1584 after[j + 7] = tolower ((int) before[i + 9]);
1585 i += 10;
1586 j += 8;
1587 }
1588 else
1589 {
1590 after[j] = before[i];
1591 i++;
1592 j++;
1593 }
1594 }
1595 *lenp = j;
1596 after = scm_realloc (after, j);
1597}
1598
1599char *
1600scm_to_locale_stringn (SCM str, size_t *lenp)
1601{
1602 SCM outport;
1603 scm_t_port *pt;
1604 const char *enc;
1605
1606 outport = scm_current_output_port ();
1607 if (!SCM_UNBNDP (outport) && SCM_OPOUTPORTP (outport))
1608 {
1609 pt = SCM_PTAB_ENTRY (outport);
1610 enc = pt->encoding;
1611 }
1612 else
1613 enc = NULL;
1614
1615 return scm_to_stringn (str, lenp,
1616 enc,
1617 scm_i_get_conversion_strategy (SCM_BOOL_F));
1618}
1619
1620/* Return a malloc(3)-allocated buffer containing the contents of STR encoded
1621 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
1622 the returned buffer. If the conversion to ENCODING fails, apply the strategy
1623 defined by HANDLER. */
1624char *
1625scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
1626 scm_t_string_failed_conversion_handler handler)
1627{
1628 char *buf;
1629 size_t ilen, len, i;
1630 int ret;
1631 const char *enc;
1632
1633 if (!scm_is_string (str))
1634 scm_wrong_type_arg_msg (NULL, 0, str, "string");
1635 ilen = scm_i_string_length (str);
1636
1637 if (ilen == 0)
1638 {
1639 buf = scm_malloc (1);
1640 buf[0] = '\0';
1641 if (lenp)
1642 *lenp = 0;
1643 return buf;
1644 }
1645
1646 if (lenp == NULL)
1647 for (i = 0; i < ilen; i++)
1648 if (scm_i_string_ref (str, i) == '\0')
1649 scm_misc_error (NULL,
1650 "string contains #\\nul character: ~S",
1651 scm_list_1 (str));
1652
1653 if (scm_i_is_narrow_string (str) && (encoding == NULL))
1654 {
1655 /* If using native Latin-1 encoding, just copy the string
1656 contents. */
1657 if (lenp)
1658 {
1659 buf = scm_malloc (ilen);
1660 memcpy (buf, scm_i_string_chars (str), ilen);
1661 *lenp = ilen;
1662 return buf;
1663 }
1664 else
1665 {
1666 buf = scm_malloc (ilen + 1);
1667 memcpy (buf, scm_i_string_chars (str), ilen);
1668 buf[ilen] = '\0';
1669 return buf;
1670 }
1671 }
1672
1673
1674 buf = NULL;
1675 len = 0;
1676 enc = encoding;
1677 if (enc == NULL)
1678 enc = "ISO-8859-1";
1679 if (scm_i_is_narrow_string (str))
1680 {
1681 ret = mem_iconveh (scm_i_string_chars (str), ilen,
1682 "ISO-8859-1", enc,
1683 (enum iconv_ilseq_handler) handler, NULL,
1684 &buf, &len);
1685
1686 if (ret == 0 && handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
1687 unistring_escapes_to_guile_escapes (&buf, &len);
1688
1689 if (ret != 0)
1690 scm_encoding_error (NULL, "cannot convert to output locale ~s: \"~s\"",
1691 scm_list_2 (scm_from_locale_string (enc), str));
1692 }
1693 else
1694 {
1695 buf = u32_conv_to_encoding (enc,
1696 (enum iconv_ilseq_handler) handler,
1697 (scm_t_uint32 *) scm_i_string_wide_chars (str),
1698 ilen,
1699 NULL,
1700 NULL, &len);
1701 if (buf == NULL)
1702 scm_encoding_error (NULL, "cannot convert to output locale ~s: \"~s\"",
1703 scm_list_2 (scm_from_locale_string (enc), str));
1704
1705 if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
1706 unistring_escapes_to_guile_escapes (&buf, &len);
1707 }
1708 if (lenp)
1709 *lenp = len;
1710 else
1711 {
1712 buf = scm_realloc (buf, len + 1);
1713 buf[len] = '\0';
1714 }
1715
1716 scm_remember_upto_here_1 (str);
1717 return buf;
1718}
1719
1720char *
1721scm_to_locale_string (SCM str)
1722{
1723 return scm_to_locale_stringn (str, NULL);
1724}
1725
1726scm_t_uint8 *
1727scm_i_to_utf8_string (SCM str)
1728{
1729 char *u8str;
1730 u8str = scm_to_stringn (str, NULL, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
1731 return (scm_t_uint8 *) u8str;
1732}
1733
1734size_t
1735scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
1736{
1737 size_t len;
1738 char *result = NULL;
1739 if (!scm_is_string (str))
1740 scm_wrong_type_arg_msg (NULL, 0, str, "string");
1741 result = scm_to_locale_stringn (str, &len);
1742
1743 memcpy (buf, result, (len > max_len) ? max_len : len);
1744 free (result);
1745
1746 scm_remember_upto_here_1 (str);
1747 return len;
1748}
1749
1750\f
1751/* Unicode string normalization. */
1752
1753/* This function is a partial clone of SCM_STRING_TO_U32_BUF from
1754 libguile/i18n.c. It would be useful to have this factored out into a more
1755 convenient location, but its use of alloca makes that tricky to do. */
1756
1757static SCM
1758normalize_str (SCM string, uninorm_t form)
1759{
1760 SCM ret;
1761 scm_t_uint32 *w_str;
1762 scm_t_wchar *cbuf;
1763 size_t rlen, len = scm_i_string_length (string);
1764
1765 if (scm_i_is_narrow_string (string))
1766 {
1767 size_t i;
1768 const char *buf = scm_i_string_chars (string);
1769
1770 w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
1771
1772 for (i = 0; i < len; i ++)
1773 w_str[i] = (unsigned char) buf[i];
1774 w_str[len] = 0;
1775 }
1776 else
1777 w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
1778
1779 w_str = u32_normalize (form, w_str, len, NULL, &rlen);
1780
1781 ret = scm_i_make_wide_string (rlen, &cbuf);
1782 u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
1783 free (w_str);
1784
1785 scm_i_try_narrow_string (ret);
1786
1787 return ret;
1788}
1789
1790SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
1791 (SCM string),
1792 "Returns the NFC normalized form of @var{string}.")
1793#define FUNC_NAME s_scm_string_normalize_nfc
1794{
1795 SCM_VALIDATE_STRING (1, string);
1796 return normalize_str (string, UNINORM_NFC);
1797}
1798#undef FUNC_NAME
1799
1800SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
1801 (SCM string),
1802 "Returns the NFD normalized form of @var{string}.")
1803#define FUNC_NAME s_scm_string_normalize_nfd
1804{
1805 SCM_VALIDATE_STRING (1, string);
1806 return normalize_str (string, UNINORM_NFD);
1807}
1808#undef FUNC_NAME
1809
1810SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
1811 (SCM string),
1812 "Returns the NFKC normalized form of @var{string}.")
1813#define FUNC_NAME s_scm_string_normalize_nfkc
1814{
1815 SCM_VALIDATE_STRING (1, string);
1816 return normalize_str (string, UNINORM_NFKC);
1817}
1818#undef FUNC_NAME
1819
1820SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
1821 (SCM string),
1822 "Returns the NFKD normalized form of @var{string}.")
1823#define FUNC_NAME s_scm_string_normalize_nfkd
1824{
1825 SCM_VALIDATE_STRING (1, string);
1826 return normalize_str (string, UNINORM_NFKD);
1827}
1828#undef FUNC_NAME
1829
1830/* converts C scm_array of strings to SCM scm_list of strings. */
1831/* If argc < 0, a null terminated scm_array is assumed. */
1832SCM
1833scm_makfromstrs (int argc, char **argv)
1834{
1835 int i = argc;
1836 SCM lst = SCM_EOL;
1837 if (0 > i)
1838 for (i = 0; argv[i]; i++);
1839 while (i--)
1840 lst = scm_cons (scm_from_locale_string (argv[i]), lst);
1841 return lst;
1842}
1843
1844/* Return a newly allocated array of char pointers to each of the strings
1845 in args, with a terminating NULL pointer. */
1846
1847char **
1848scm_i_allocate_string_pointers (SCM list)
1849#define FUNC_NAME "scm_i_allocate_string_pointers"
1850{
1851 char **result;
1852 int len = scm_ilength (list);
1853 int i;
1854
1855 if (len < 0)
1856 scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
1857
1858 result = scm_gc_malloc ((len + 1) * sizeof (char *),
1859 "string pointers");
1860 result[len] = NULL;
1861
1862 /* The list might be have been modified in another thread, so
1863 we check LIST before each access.
1864 */
1865 for (i = 0; i < len && scm_is_pair (list); i++)
1866 {
1867 SCM str;
1868 size_t len;
1869
1870 str = SCM_CAR (list);
1871 len = scm_c_string_length (str);
1872
1873 result[i] = scm_gc_malloc_pointerless (len + 1, "string pointers");
1874 memcpy (result[i], scm_i_string_chars (str), len);
1875 result[i][len] = '\0';
1876
1877 list = SCM_CDR (list);
1878 }
1879
1880 return result;
1881}
1882#undef FUNC_NAME
1883
1884void
1885scm_i_get_substring_spec (size_t len,
1886 SCM start, size_t *cstart,
1887 SCM end, size_t *cend)
1888{
1889 if (SCM_UNBNDP (start))
1890 *cstart = 0;
1891 else
1892 *cstart = scm_to_unsigned_integer (start, 0, len);
1893
1894 if (SCM_UNBNDP (end))
1895 *cend = len;
1896 else
1897 *cend = scm_to_unsigned_integer (end, *cstart, len);
1898}
1899
1900#if SCM_ENABLE_DEPRECATED
1901
1902/* When these definitions are removed, it becomes reasonable to use
1903 read-only strings for string literals. For that, change the reader
1904 to create string literals with scm_c_substring_read_only instead of
1905 with scm_c_substring_copy.
1906*/
1907
1908int
1909scm_i_deprecated_stringp (SCM str)
1910{
1911 scm_c_issue_deprecation_warning
1912 ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
1913
1914 return scm_is_string (str);
1915}
1916
1917char *
1918scm_i_deprecated_string_chars (SCM str)
1919{
1920 char *chars;
1921
1922 scm_c_issue_deprecation_warning
1923 ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
1924
1925 /* We don't accept shared substrings here since they are not
1926 null-terminated.
1927 */
1928 if (IS_SH_STRING (str))
1929 scm_misc_error (NULL,
1930 "SCM_STRING_CHARS does not work with shared substrings",
1931 SCM_EOL);
1932
1933 /* We explicitly test for read-only strings to produce a better
1934 error message.
1935 */
1936
1937 if (IS_RO_STRING (str))
1938 scm_misc_error (NULL,
1939 "SCM_STRING_CHARS does not work with read-only strings",
1940 SCM_EOL);
1941
1942 /* The following is still wrong, of course...
1943 */
1944 str = scm_i_string_start_writing (str);
1945 chars = scm_i_string_writable_chars (str);
1946 scm_i_string_stop_writing ();
1947 return chars;
1948}
1949
1950size_t
1951scm_i_deprecated_string_length (SCM str)
1952{
1953 scm_c_issue_deprecation_warning
1954 ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
1955 return scm_c_string_length (str);
1956}
1957
1958#endif
1959
1960static SCM
1961string_handle_ref (scm_t_array_handle *h, size_t index)
1962{
1963 return scm_c_string_ref (h->array, index);
1964}
1965
1966static void
1967string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
1968{
1969 scm_c_string_set_x (h->array, index, val);
1970}
1971
1972static void
1973string_get_handle (SCM v, scm_t_array_handle *h)
1974{
1975 h->array = v;
1976 h->ndims = 1;
1977 h->dims = &h->dim0;
1978 h->dim0.lbnd = 0;
1979 h->dim0.ubnd = scm_c_string_length (v) - 1;
1980 h->dim0.inc = 1;
1981 h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
1982 h->elements = h->writable_elements = NULL;
1983}
1984
1985SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
1986 string_handle_ref, string_handle_set,
1987 string_get_handle)
1988SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
1989
1990void
1991scm_init_strings ()
1992{
1993 scm_nullstr = scm_i_make_string (0, NULL);
1994
1995#include "libguile/strings.x"
1996}
1997
1998
1999/*
2000 Local Variables:
2001 c-file-style: "gnu"
2002 End:
2003*/