1 /* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
33 #include "striconveh.h"
35 #include "libguile/_scm.h"
36 #include "libguile/chars.h"
37 #include "libguile/root.h"
38 #include "libguile/strings.h"
39 #include "libguile/error.h"
40 #include "libguile/generalized-vectors.h"
41 #include "libguile/deprecation.h"
42 #include "libguile/validate.h"
43 #include "libguile/private-options.h"
53 * XXX - keeping an accurate refcount during GC seems to be quite
54 * tricky, so we just keep score of whether a stringbuf might be
55 * shared, not whether it definitely is.
57 * The scheme I (mvo) tried to keep an accurate reference count would
58 * recount all strings that point to a stringbuf during the mark-phase
59 * of the GC. This was done since one cannot access the stringbuf of
60 * a string when that string is freed (in order to decrease the
61 * reference count). The memory of the stringbuf might have been
62 * reused already for something completely different.
64 * This recounted worked for a small number of threads beating on
65 * cow-strings, but it failed randomly with more than 10 threads, say.
66 * I couldn't figure out what went wrong, so I used the conservative
67 * approach implemented below.
69 * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
70 * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
74 /* The size in words of the stringbuf header (type tag + size). */
75 #define STRINGBUF_HEADER_SIZE 2U
77 #define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
79 #define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
80 #define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
82 #define STRINGBUF_TAG scm_tc7_stringbuf
83 #define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
84 #define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
86 #define STRINGBUF_CONTENTS(buf) ((void *) \
87 SCM_CELL_OBJECT_LOC (buf, \
88 STRINGBUF_HEADER_SIZE))
89 #define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
90 #define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
92 #define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
94 #define SET_STRINGBUF_SHARED(buf) \
97 /* Don't modify BUF if it's already marked as shared since it might be \
98 a read-only, statically allocated stringbuf. */ \
99 if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
100 SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
104 #ifdef SCM_STRING_LENGTH_HISTOGRAM
105 static size_t lenhist
[1001];
108 /* Make a stringbuf with space for LEN 8-bit Latin-1-encoded
111 make_stringbuf (size_t len
)
113 /* XXX - for the benefit of SCM_STRING_CHARS, SCM_SYMBOL_CHARS and
114 scm_i_symbol_chars, all stringbufs are null-terminated. Once
115 SCM_STRING_CHARS and SCM_SYMBOL_CHARS are removed and the code
116 has been changed for scm_i_symbol_chars, this null-termination
122 #ifdef SCM_STRING_LENGTH_HISTOGRAM
129 buf
= SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES
+ len
+ 1,
132 SCM_SET_CELL_TYPE (buf
, STRINGBUF_TAG
);
133 SCM_SET_CELL_WORD_1 (buf
, (scm_t_bits
) len
);
135 STRINGBUF_CHARS (buf
)[len
] = 0;
140 /* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
143 make_wide_stringbuf (size_t len
)
148 #ifdef SCM_STRING_LENGTH_HISTOGRAM
155 raw_len
= (len
+ 1) * sizeof (scm_t_wchar
);
156 buf
= SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES
+ raw_len
,
159 SCM_SET_CELL_TYPE (buf
, STRINGBUF_TAG
| STRINGBUF_F_WIDE
);
160 SCM_SET_CELL_WORD_1 (buf
, (scm_t_bits
) len
);
162 STRINGBUF_WIDE_CHARS (buf
)[len
] = 0;
167 /* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
168 characters from BUF. */
170 wide_stringbuf (SCM buf
)
174 if (STRINGBUF_WIDE (buf
))
181 len
= STRINGBUF_LENGTH (buf
);
183 new_buf
= make_wide_stringbuf (len
);
185 mem
= STRINGBUF_WIDE_CHARS (new_buf
);
186 for (i
= 0; i
< len
; i
++)
187 mem
[i
] = (scm_t_wchar
) STRINGBUF_CHARS (buf
)[i
];
194 /* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
195 characters from BUF, if possible. */
197 narrow_stringbuf (SCM buf
)
201 if (!STRINGBUF_WIDE (buf
))
209 len
= STRINGBUF_LENGTH (buf
);
210 wmem
= STRINGBUF_WIDE_CHARS (buf
);
212 for (i
= 0; i
< len
; i
++)
214 /* BUF cannot be narrowed. */
217 new_buf
= make_stringbuf (len
);
219 mem
= STRINGBUF_CHARS (new_buf
);
220 for (i
= 0; i
< len
; i
++)
221 mem
[i
] = (unsigned char) wmem
[i
];
228 scm_i_pthread_mutex_t stringbuf_write_mutex
= SCM_I_PTHREAD_MUTEX_INITIALIZER
;
231 /* Copy-on-write strings.
234 #define STRING_TAG scm_tc7_string
236 #define STRING_STRINGBUF(str) (SCM_CELL_OBJECT_1(str))
237 #define STRING_START(str) ((size_t)SCM_CELL_WORD_2(str))
238 #define STRING_LENGTH(str) ((size_t)SCM_CELL_WORD_3(str))
240 #define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
241 #define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
243 #define IS_STRING(str) (SCM_HAS_TYP7 (str, STRING_TAG))
245 /* Read-only strings.
248 #define RO_STRING_TAG scm_tc7_ro_string
249 #define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
251 /* Mutation-sharing substrings
254 #define SH_STRING_TAG (scm_tc7_string + 0x100)
256 #define SH_STRING_STRING(sh) (SCM_CELL_OBJECT_1(sh))
257 /* START and LENGTH as for STRINGs. */
259 #define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
263 /* Create a scheme string with space for LEN 8-bit Latin-1-encoded
264 characters. CHARSP, if not NULL, will be set to location of the
265 char array. If READ_ONLY_P, the returned string is read-only;
266 otherwise it is writable. */
268 scm_i_make_string (size_t len
, char **charsp
, int read_only_p
)
270 SCM buf
= make_stringbuf (len
);
273 *charsp
= (char *) STRINGBUF_CHARS (buf
);
274 res
= scm_double_cell (read_only_p
? RO_STRING_TAG
: STRING_TAG
,
276 (scm_t_bits
) 0, (scm_t_bits
) len
);
280 /* Create a scheme string with space for LEN 32-bit UCS-4-encoded
281 characters. CHARSP, if not NULL, will be set to location of the
282 character array. If READ_ONLY_P, the returned string is read-only;
283 otherwise it is writable. */
285 scm_i_make_wide_string (size_t len
, scm_t_wchar
**charsp
, int read_only_p
)
287 SCM buf
= make_wide_stringbuf (len
);
290 *charsp
= STRINGBUF_WIDE_CHARS (buf
);
291 res
= scm_double_cell (read_only_p
? RO_STRING_TAG
: STRING_TAG
,
293 (scm_t_bits
) 0, (scm_t_bits
) len
);
298 validate_substring_args (SCM str
, size_t start
, size_t end
)
300 if (!IS_STRING (str
))
301 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
302 if (start
> STRING_LENGTH (str
))
303 scm_out_of_range (NULL
, scm_from_size_t (start
));
304 if (end
> STRING_LENGTH (str
) || end
< start
)
305 scm_out_of_range (NULL
, scm_from_size_t (end
));
309 get_str_buf_start (SCM
*str
, SCM
*buf
, size_t *start
)
311 *start
= STRING_START (*str
);
312 if (IS_SH_STRING (*str
))
314 *str
= SH_STRING_STRING (*str
);
315 *start
+= STRING_START (*str
);
317 *buf
= STRING_STRINGBUF (*str
);
321 scm_i_substring (SCM str
, size_t start
, size_t end
)
325 get_str_buf_start (&str
, &buf
, &str_start
);
326 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
327 SET_STRINGBUF_SHARED (buf
);
328 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
329 return scm_double_cell (STRING_TAG
, SCM_UNPACK(buf
),
330 (scm_t_bits
)str_start
+ start
,
331 (scm_t_bits
) end
- start
);
335 scm_i_substring_read_only (SCM str
, size_t start
, size_t end
)
339 get_str_buf_start (&str
, &buf
, &str_start
);
340 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
341 SET_STRINGBUF_SHARED (buf
);
342 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
343 return scm_double_cell (RO_STRING_TAG
, SCM_UNPACK(buf
),
344 (scm_t_bits
)str_start
+ start
,
345 (scm_t_bits
) end
- start
);
349 scm_i_substring_copy (SCM str
, size_t start
, size_t end
)
351 size_t len
= end
- start
;
354 get_str_buf_start (&str
, &buf
, &str_start
);
355 if (scm_i_is_narrow_string (str
))
357 my_buf
= make_stringbuf (len
);
358 memcpy (STRINGBUF_CHARS (my_buf
),
359 STRINGBUF_CHARS (buf
) + str_start
+ start
, len
);
363 my_buf
= make_wide_stringbuf (len
);
364 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (my_buf
),
365 (scm_t_uint32
*) (STRINGBUF_WIDE_CHARS (buf
) + str_start
367 /* Even though this string is wide, the substring may be narrow.
368 Consider adding code to narrow the string. */
370 scm_remember_upto_here_1 (buf
);
371 return scm_double_cell (STRING_TAG
, SCM_UNPACK (my_buf
),
372 (scm_t_bits
) 0, (scm_t_bits
) len
);
376 scm_i_substring_shared (SCM str
, size_t start
, size_t end
)
378 if (start
== 0 && end
== STRING_LENGTH (str
))
382 size_t len
= end
- start
;
383 if (IS_SH_STRING (str
))
385 start
+= STRING_START (str
);
386 str
= SH_STRING_STRING (str
);
388 return scm_double_cell (SH_STRING_TAG
, SCM_UNPACK(str
),
389 (scm_t_bits
)start
, (scm_t_bits
) len
);
394 scm_c_substring (SCM str
, size_t start
, size_t end
)
396 validate_substring_args (str
, start
, end
);
397 return scm_i_substring (str
, start
, end
);
401 scm_c_substring_read_only (SCM str
, size_t start
, size_t end
)
403 validate_substring_args (str
, start
, end
);
404 return scm_i_substring_read_only (str
, start
, end
);
408 scm_c_substring_copy (SCM str
, size_t start
, size_t end
)
410 validate_substring_args (str
, start
, end
);
411 return scm_i_substring_copy (str
, start
, end
);
415 scm_c_substring_shared (SCM str
, size_t start
, size_t end
)
417 validate_substring_args (str
, start
, end
);
418 return scm_i_substring_shared (str
, start
, end
);
422 /* Internal accessors
425 /* Returns the number of characters in STR. This may be different
426 than the memory size of the string storage. */
428 scm_i_string_length (SCM str
)
430 return STRING_LENGTH (str
);
433 /* True if the string is 'narrow', meaning it has a 8-bit Latin-1
434 encoding. False if it is 'wide', having a 32-bit UCS-4
437 scm_i_is_narrow_string (SCM str
)
439 if (IS_SH_STRING (str
))
440 str
= SH_STRING_STRING (str
);
442 return !STRINGBUF_WIDE (STRING_STRINGBUF (str
));
445 /* Try to coerce a string to be narrow. It if is narrow already, do
446 nothing. If it is wide, shrink it to narrow if none of its
447 characters are above 0xFF. Return true if the string is narrow or
448 was made to be narrow. */
450 scm_i_try_narrow_string (SCM str
)
452 if (IS_SH_STRING (str
))
453 str
= SH_STRING_STRING (str
);
455 SET_STRING_STRINGBUF (str
, narrow_stringbuf (STRING_STRINGBUF (str
)));
457 return scm_i_is_narrow_string (str
);
460 /* Return a pointer to the raw data of the string, which can be either Latin-1
461 or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
463 scm_i_string_data (SCM str
)
469 get_str_buf_start (&str
, &buf
, &start
);
471 data
= STRINGBUF_CONTENTS (buf
);
472 data
+= start
* (scm_i_is_narrow_string (str
) ? 1 : 4);
477 /* Returns a pointer to the 8-bit Latin-1 encoded character array of
480 scm_i_string_chars (SCM str
)
484 get_str_buf_start (&str
, &buf
, &start
);
485 if (scm_i_is_narrow_string (str
))
486 return (const char *) STRINGBUF_CHARS (buf
) + start
;
488 scm_misc_error (NULL
, "Invalid read access of chars of wide string: ~s",
493 /* Returns a pointer to the 32-bit UCS-4 encoded character array of
496 scm_i_string_wide_chars (SCM str
)
501 get_str_buf_start (&str
, &buf
, &start
);
502 if (!scm_i_is_narrow_string (str
))
503 return (const scm_t_wchar
*) STRINGBUF_WIDE_CHARS (buf
) + start
;
505 scm_misc_error (NULL
, "Invalid read access of chars of narrow string: ~s",
509 /* If the buffer in ORIG_STR is shared, copy ORIG_STR's characters to
510 a new string buffer, so that it can be modified without modifying
511 other strings. Also, lock the string mutex. Later, one must call
512 scm_i_string_stop_writing to unlock the mutex. */
514 scm_i_string_start_writing (SCM orig_str
)
516 SCM buf
, str
= orig_str
;
519 get_str_buf_start (&str
, &buf
, &start
);
520 if (IS_RO_STRING (str
))
521 scm_misc_error (NULL
, "string is read-only: ~s", scm_list_1 (orig_str
));
523 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
524 if (STRINGBUF_SHARED (buf
))
526 /* Clone the stringbuf. */
527 size_t len
= STRING_LENGTH (str
);
530 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
532 if (scm_i_is_narrow_string (str
))
534 new_buf
= make_stringbuf (len
);
535 memcpy (STRINGBUF_CHARS (new_buf
),
536 STRINGBUF_CHARS (buf
) + STRING_START (str
), len
);
541 new_buf
= make_wide_stringbuf (len
);
542 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (new_buf
),
543 (scm_t_uint32
*) (STRINGBUF_WIDE_CHARS (buf
)
544 + STRING_START (str
)), len
);
547 SET_STRING_STRINGBUF (str
, new_buf
);
548 start
-= STRING_START (str
);
550 /* FIXME: The following operations are not atomic, so other threads
551 looking at STR may see an inconsistent state. Nevertheless it can't
552 hurt much since (i) accessing STR while it is being mutated can't
553 yield a crash, and (ii) concurrent accesses to STR should be
554 protected by a mutex at the application level. The latter may not
555 apply when STR != ORIG_STR, though. */
556 SET_STRING_START (str
, 0);
557 SET_STRING_STRINGBUF (str
, new_buf
);
561 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
566 /* Return a pointer to the 8-bit Latin-1 chars of a string. */
568 scm_i_string_writable_chars (SCM str
)
573 get_str_buf_start (&str
, &buf
, &start
);
574 if (scm_i_is_narrow_string (str
))
575 return (char *) STRINGBUF_CHARS (buf
) + start
;
577 scm_misc_error (NULL
, "Invalid write access of chars of wide string: ~s",
582 /* Return a pointer to the UCS-4 codepoints of a string. */
584 scm_i_string_writable_wide_chars (SCM str
)
589 get_str_buf_start (&str
, &buf
, &start
);
590 if (!scm_i_is_narrow_string (str
))
591 return STRINGBUF_WIDE_CHARS (buf
) + start
;
593 scm_misc_error (NULL
, "Invalid write access of chars of narrow string: ~s",
597 /* Unlock the string mutex that was locked when
598 scm_i_string_start_writing was called. */
600 scm_i_string_stop_writing (void)
602 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
605 /* Return the Xth character of STR as a UCS-4 codepoint. */
607 scm_i_string_ref (SCM str
, size_t x
)
609 if (scm_i_is_narrow_string (str
))
610 return (scm_t_wchar
) (unsigned char) (scm_i_string_chars (str
)[x
]);
612 return scm_i_string_wide_chars (str
)[x
];
615 /* Returns index+1 of the first char in STR that matches C, or
616 0 if the char is not found. */
618 scm_i_string_contains_char (SCM str
, char ch
)
621 size_t len
= scm_i_string_length (str
);
624 if (scm_i_is_narrow_string (str
))
628 if (scm_i_string_chars (str
)[i
] == ch
)
637 if (scm_i_string_wide_chars (str
)[i
]
638 == (unsigned char) ch
)
647 scm_i_string_strcmp (SCM sstr
, size_t start_x
, const char *cstr
)
649 if (scm_i_is_narrow_string (sstr
))
651 const char *a
= scm_i_string_chars (sstr
) + start_x
;
652 const char *b
= cstr
;
653 return strncmp (a
, b
, strlen(b
));
658 const scm_t_wchar
*a
= scm_i_string_wide_chars (sstr
) + start_x
;
659 const char *b
= cstr
;
660 for (i
= 0; i
< strlen (b
); i
++)
662 if (a
[i
] != (unsigned char) b
[i
])
669 /* Set the Pth character of STR to UCS-4 codepoint CHR. */
671 scm_i_string_set_x (SCM str
, size_t p
, scm_t_wchar chr
)
673 if (IS_SH_STRING (str
))
675 p
+= STRING_START (str
);
676 str
= SH_STRING_STRING (str
);
679 if (chr
> 0xFF && scm_i_is_narrow_string (str
))
680 SET_STRING_STRINGBUF (str
, wide_stringbuf (STRING_STRINGBUF (str
)));
682 if (scm_i_is_narrow_string (str
))
684 char *dst
= scm_i_string_writable_chars (str
);
689 scm_t_wchar
*dst
= scm_i_string_writable_wide_chars (str
);
697 Basic symbol creation and accessing is done here, the rest is in
698 symbols.[hc]. This has been done to keep stringbufs and the
699 internals of strings and string-like objects confined to this file.
702 #define SYMBOL_STRINGBUF SCM_CELL_OBJECT_1
705 scm_i_make_symbol (SCM name
, scm_t_bits flags
,
706 unsigned long hash
, SCM props
)
709 size_t start
= STRING_START (name
);
710 size_t length
= STRING_LENGTH (name
);
712 if (IS_SH_STRING (name
))
714 name
= SH_STRING_STRING (name
);
715 start
+= STRING_START (name
);
717 buf
= SYMBOL_STRINGBUF (name
);
719 if (start
== 0 && length
== STRINGBUF_LENGTH (buf
))
722 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
723 SET_STRINGBUF_SHARED (buf
);
724 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
729 if (scm_i_is_narrow_string (name
))
731 SCM new_buf
= make_stringbuf (length
);
732 memcpy (STRINGBUF_CHARS (new_buf
),
733 STRINGBUF_CHARS (buf
) + start
, length
);
738 SCM new_buf
= make_wide_stringbuf (length
);
739 u32_cpy ((scm_t_uint32
*) STRINGBUF_WIDE_CHARS (new_buf
),
740 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
) + start
,
745 return scm_double_cell (scm_tc7_symbol
| flags
, SCM_UNPACK (buf
),
746 (scm_t_bits
) hash
, SCM_UNPACK (props
));
750 scm_i_c_make_symbol (const char *name
, size_t len
,
751 scm_t_bits flags
, unsigned long hash
, SCM props
)
753 SCM buf
= make_stringbuf (len
);
754 memcpy (STRINGBUF_CHARS (buf
), name
, len
);
756 return scm_double_cell (scm_tc7_symbol
| flags
, SCM_UNPACK (buf
),
757 (scm_t_bits
) hash
, SCM_UNPACK (props
));
760 /* Returns the number of characters in SYM. This may be different
761 from the memory size of SYM. */
763 scm_i_symbol_length (SCM sym
)
765 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym
));
769 scm_c_symbol_length (SCM sym
)
770 #define FUNC_NAME "scm_c_symbol_length"
772 SCM_VALIDATE_SYMBOL (1, sym
);
774 return STRINGBUF_LENGTH (SYMBOL_STRINGBUF (sym
));
778 /* True if the name of SYM is stored as a Latin-1 encoded string.
779 False if it is stored as a 32-bit UCS-4-encoded string. */
781 scm_i_is_narrow_symbol (SCM sym
)
785 buf
= SYMBOL_STRINGBUF (sym
);
786 return !STRINGBUF_WIDE (buf
);
789 /* Returns a pointer to the 8-bit Latin-1 encoded character array that
790 contains the name of SYM. */
792 scm_i_symbol_chars (SCM sym
)
796 buf
= SYMBOL_STRINGBUF (sym
);
797 if (!STRINGBUF_WIDE (buf
))
798 return (const char *) STRINGBUF_CHARS (buf
);
800 scm_misc_error (NULL
, "Invalid access of chars of a wide symbol ~S",
804 /* Return a pointer to the 32-bit UCS-4-encoded character array of a
807 scm_i_symbol_wide_chars (SCM sym
)
811 buf
= SYMBOL_STRINGBUF (sym
);
812 if (STRINGBUF_WIDE (buf
))
813 return (const scm_t_wchar
*) STRINGBUF_WIDE_CHARS (buf
);
815 scm_misc_error (NULL
, "Invalid access of chars of a narrow symbol ~S",
820 scm_i_symbol_substring (SCM sym
, size_t start
, size_t end
)
822 SCM buf
= SYMBOL_STRINGBUF (sym
);
823 scm_i_pthread_mutex_lock (&stringbuf_write_mutex
);
824 SET_STRINGBUF_SHARED (buf
);
825 scm_i_pthread_mutex_unlock (&stringbuf_write_mutex
);
826 return scm_double_cell (RO_STRING_TAG
, SCM_UNPACK (buf
),
827 (scm_t_bits
)start
, (scm_t_bits
) end
- start
);
830 /* Returns the Xth character of symbol SYM as a UCS-4 codepoint. */
832 scm_i_symbol_ref (SCM sym
, size_t x
)
834 if (scm_i_is_narrow_symbol (sym
))
835 return (scm_t_wchar
) (unsigned char) (scm_i_symbol_chars (sym
)[x
]);
837 return scm_i_symbol_wide_chars (sym
)[x
];
843 SCM_DEFINE (scm_sys_string_dump
, "%string-dump", 1, 0, 0, (SCM str
),
844 "Returns an association list containing debugging information\n"
845 "for @var{str}. The association list has the following entries."
848 "The string itself.\n"
850 "The start index of the string into its stringbuf\n"
852 "The length of the string\n"
854 "If this string is a substring, it returns its parent string.\n"
855 "Otherwise, it returns @code{#f}\n"
857 "@code{#t} if the string is read-only\n"
858 "@item stringbuf-chars\n"
859 "A new string containing this string's stringbuf's characters\n"
860 "@item stringbuf-length\n"
861 "The number of characters in this stringbuf\n"
862 "@item stringbuf-shared\n"
863 "@code{#t} if this stringbuf is shared\n"
864 "@item stringbuf-wide\n"
865 "@code{#t} if this stringbuf's characters are stored in a\n"
866 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
869 #define FUNC_NAME s_scm_sys_string_dump
871 SCM e1
, e2
, e3
, e4
, e5
, e6
, e7
, e8
, e9
;
873 SCM_VALIDATE_STRING (1, str
);
876 e1
= scm_cons (scm_from_latin1_symbol ("string"),
878 e2
= scm_cons (scm_from_latin1_symbol ("start"),
879 scm_from_size_t (STRING_START (str
)));
880 e3
= scm_cons (scm_from_latin1_symbol ("length"),
881 scm_from_size_t (STRING_LENGTH (str
)));
883 if (IS_SH_STRING (str
))
885 e4
= scm_cons (scm_from_latin1_symbol ("shared"),
886 SH_STRING_STRING (str
));
887 buf
= STRING_STRINGBUF (SH_STRING_STRING (str
));
891 e4
= scm_cons (scm_from_latin1_symbol ("shared"),
893 buf
= STRING_STRINGBUF (str
);
896 if (IS_RO_STRING (str
))
897 e5
= scm_cons (scm_from_latin1_symbol ("read-only"),
900 e5
= scm_cons (scm_from_latin1_symbol ("read-only"),
904 if (!STRINGBUF_WIDE (buf
))
906 size_t len
= STRINGBUF_LENGTH (buf
);
908 SCM sbc
= scm_i_make_string (len
, &cbuf
, 0);
909 memcpy (cbuf
, STRINGBUF_CHARS (buf
), len
);
910 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
915 size_t len
= STRINGBUF_LENGTH (buf
);
917 SCM sbc
= scm_i_make_wide_string (len
, &cbuf
, 0);
918 u32_cpy ((scm_t_uint32
*) cbuf
,
919 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
), len
);
920 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
923 e7
= scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
924 scm_from_size_t (STRINGBUF_LENGTH (buf
)));
925 if (STRINGBUF_SHARED (buf
))
926 e8
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
929 e8
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
931 if (STRINGBUF_WIDE (buf
))
932 e9
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
935 e9
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
938 return scm_list_n (e1
, e2
, e3
, e4
, e5
, e6
, e7
, e8
, e9
, SCM_UNDEFINED
);
942 SCM_DEFINE (scm_sys_symbol_dump
, "%symbol-dump", 1, 0, 0, (SCM sym
),
943 "Returns an association list containing debugging information\n"
944 "for @var{sym}. The association list has the following entries."
947 "The symbol itself\n"
951 "@code{#t} if it is an interned symbol\n"
952 "@item stringbuf-chars\n"
953 "A new string containing this symbols's stringbuf's characters\n"
954 "@item stringbuf-length\n"
955 "The number of characters in this stringbuf\n"
956 "@item stringbuf-shared\n"
957 "@code{#t} if this stringbuf is shared\n"
958 "@item stringbuf-wide\n"
959 "@code{#t} if this stringbuf's characters are stored in a\n"
960 "32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
963 #define FUNC_NAME s_scm_sys_symbol_dump
965 SCM e1
, e2
, e3
, e4
, e5
, e6
, e7
;
967 SCM_VALIDATE_SYMBOL (1, sym
);
968 e1
= scm_cons (scm_from_latin1_symbol ("symbol"),
970 e2
= scm_cons (scm_from_latin1_symbol ("hash"),
971 scm_from_ulong (scm_i_symbol_hash (sym
)));
972 e3
= scm_cons (scm_from_latin1_symbol ("interned"),
973 scm_symbol_interned_p (sym
));
974 buf
= SYMBOL_STRINGBUF (sym
);
977 if (!STRINGBUF_WIDE (buf
))
979 size_t len
= STRINGBUF_LENGTH (buf
);
981 SCM sbc
= scm_i_make_string (len
, &cbuf
, 0);
982 memcpy (cbuf
, STRINGBUF_CHARS (buf
), len
);
983 e4
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
988 size_t len
= STRINGBUF_LENGTH (buf
);
990 SCM sbc
= scm_i_make_wide_string (len
, &cbuf
, 0);
991 u32_cpy ((scm_t_uint32
*) cbuf
,
992 (scm_t_uint32
*) STRINGBUF_WIDE_CHARS (buf
), len
);
993 e4
= scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
996 e5
= scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
997 scm_from_size_t (STRINGBUF_LENGTH (buf
)));
998 if (STRINGBUF_SHARED (buf
))
999 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1002 e6
= scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
1004 if (STRINGBUF_WIDE (buf
))
1005 e7
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1008 e7
= scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
1010 return scm_list_n (e1
, e2
, e3
, e4
, e5
, e6
, e7
, SCM_UNDEFINED
);
1015 #ifdef SCM_STRING_LENGTH_HISTOGRAM
1017 SCM_DEFINE (scm_sys_stringbuf_hist
, "%stringbuf-hist", 0, 0, 0, (void), "")
1018 #define FUNC_NAME s_scm_sys_stringbuf_hist
1021 for (i
= 0; i
< 1000; i
++)
1023 fprintf (stderr
, " %3d: %u\n", i
, lenhist
[i
]);
1024 fprintf (stderr
, ">999: %u\n", lenhist
[1000]);
1025 return SCM_UNSPECIFIED
;
1033 SCM_DEFINE (scm_string_p
, "string?", 1, 0, 0,
1035 "Return @code{#t} if @var{obj} is a string, else @code{#f}.")
1036 #define FUNC_NAME s_scm_string_p
1038 return scm_from_bool (IS_STRING (obj
));
1043 SCM_REGISTER_PROC (s_scm_list_to_string
, "list->string", 1, 0, 0, scm_string
);
1045 SCM_DEFINE (scm_string
, "string", 0, 0, 1,
1047 "@deffnx {Scheme Procedure} list->string chrs\n"
1048 "Return a newly allocated string composed of the arguments,\n"
1050 #define FUNC_NAME s_scm_string
1052 SCM result
= SCM_BOOL_F
;
1059 /* Verify that this is a list of chars. */
1060 i
= scm_ilength (chrs
);
1061 SCM_ASSERT (i
>= 0, chrs
, SCM_ARG1
, FUNC_NAME
);
1066 while (len
> 0 && scm_is_pair (rest
))
1068 SCM elt
= SCM_CAR (rest
);
1069 SCM_VALIDATE_CHAR (SCM_ARGn
, elt
);
1070 if (SCM_CHAR (elt
) > 0xFF)
1072 rest
= SCM_CDR (rest
);
1074 scm_remember_upto_here_1 (elt
);
1077 /* Construct a string containing this list of chars. */
1085 result
= scm_i_make_string (len
, NULL
, 0);
1086 result
= scm_i_string_start_writing (result
);
1087 buf
= scm_i_string_writable_chars (result
);
1088 while (len
> 0 && scm_is_pair (rest
))
1090 SCM elt
= SCM_CAR (rest
);
1091 buf
[p
] = (unsigned char) SCM_CHAR (elt
);
1093 rest
= SCM_CDR (rest
);
1095 scm_remember_upto_here_1 (elt
);
1102 result
= scm_i_make_wide_string (len
, NULL
, 0);
1103 result
= scm_i_string_start_writing (result
);
1104 buf
= scm_i_string_writable_wide_chars (result
);
1105 while (len
> 0 && scm_is_pair (rest
))
1107 SCM elt
= SCM_CAR (rest
);
1108 buf
[p
] = SCM_CHAR (elt
);
1110 rest
= SCM_CDR (rest
);
1112 scm_remember_upto_here_1 (elt
);
1115 scm_i_string_stop_writing ();
1118 scm_misc_error (NULL
, "list changed while constructing string", SCM_EOL
);
1119 if (!scm_is_null (rest
))
1120 scm_wrong_type_arg_msg (NULL
, 0, chrs
, "proper list");
1126 SCM_DEFINE (scm_make_string
, "make-string", 1, 1, 0,
1128 "Return a newly allocated string of\n"
1129 "length @var{k}. If @var{chr} is given, then all elements of\n"
1130 "the string are initialized to @var{chr}, otherwise the contents\n"
1131 "of the @var{string} are all set to @var{#\nul}.")
1132 #define FUNC_NAME s_scm_make_string
1134 return scm_c_make_string (scm_to_size_t (k
), chr
);
1139 scm_c_make_string (size_t len
, SCM chr
)
1140 #define FUNC_NAME NULL
1143 char *contents
= NULL
;
1144 SCM res
= scm_i_make_string (len
, &contents
, 0);
1146 /* If no char is given, initialize string contents to NULL. */
1147 if (SCM_UNBNDP (chr
))
1148 memset (contents
, 0, len
);
1151 SCM_VALIDATE_CHAR (0, chr
);
1152 res
= scm_i_string_start_writing (res
);
1153 for (p
= 0; p
< len
; p
++)
1154 scm_i_string_set_x (res
, p
, SCM_CHAR (chr
));
1155 scm_i_string_stop_writing ();
1162 SCM_DEFINE (scm_string_length
, "string-length", 1, 0, 0,
1164 "Return the number of characters in @var{string}.")
1165 #define FUNC_NAME s_scm_string_length
1167 SCM_VALIDATE_STRING (1, string
);
1168 return scm_from_size_t (STRING_LENGTH (string
));
1172 SCM_DEFINE (scm_string_bytes_per_char
, "string-bytes-per-char", 1, 0, 0,
1174 "Return the bytes used to represent a character in @var{string}."
1175 "This will return 1 or 4.")
1176 #define FUNC_NAME s_scm_string_bytes_per_char
1178 SCM_VALIDATE_STRING (1, string
);
1179 if (!scm_i_is_narrow_string (string
))
1180 return scm_from_int (4);
1182 return scm_from_int (1);
1187 scm_c_string_length (SCM string
)
1189 if (!IS_STRING (string
))
1190 scm_wrong_type_arg_msg (NULL
, 0, string
, "string");
1191 return STRING_LENGTH (string
);
1194 SCM_DEFINE (scm_string_ref
, "string-ref", 2, 0, 0,
1196 "Return character @var{k} of @var{str} using zero-origin\n"
1197 "indexing. @var{k} must be a valid index of @var{str}.")
1198 #define FUNC_NAME s_scm_string_ref
1203 SCM_VALIDATE_STRING (1, str
);
1205 len
= scm_i_string_length (str
);
1206 if (SCM_LIKELY (len
> 0))
1207 idx
= scm_to_unsigned_integer (k
, 0, len
- 1);
1209 scm_out_of_range (NULL
, k
);
1211 if (scm_i_is_narrow_string (str
))
1212 return SCM_MAKE_CHAR (scm_i_string_chars (str
)[idx
]);
1214 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str
)[idx
]);
1219 scm_c_string_ref (SCM str
, size_t p
)
1221 if (p
>= scm_i_string_length (str
))
1222 scm_out_of_range (NULL
, scm_from_size_t (p
));
1223 if (scm_i_is_narrow_string (str
))
1224 return SCM_MAKE_CHAR (scm_i_string_chars (str
)[p
]);
1226 return SCM_MAKE_CHAR (scm_i_string_wide_chars (str
)[p
]);
1230 SCM_DEFINE (scm_string_set_x
, "string-set!", 3, 0, 0,
1231 (SCM str
, SCM k
, SCM chr
),
1232 "Store @var{chr} in element @var{k} of @var{str} and return\n"
1233 "an unspecified value. @var{k} must be a valid index of\n"
1235 #define FUNC_NAME s_scm_string_set_x
1240 SCM_VALIDATE_STRING (1, str
);
1242 len
= scm_i_string_length (str
);
1243 if (SCM_LIKELY (len
> 0))
1244 idx
= scm_to_unsigned_integer (k
, 0, len
- 1);
1246 scm_out_of_range (NULL
, k
);
1248 SCM_VALIDATE_CHAR (3, chr
);
1249 str
= scm_i_string_start_writing (str
);
1250 scm_i_string_set_x (str
, idx
, SCM_CHAR (chr
));
1251 scm_i_string_stop_writing ();
1253 return SCM_UNSPECIFIED
;
1258 scm_c_string_set_x (SCM str
, size_t p
, SCM chr
)
1260 if (p
>= scm_i_string_length (str
))
1261 scm_out_of_range (NULL
, scm_from_size_t (p
));
1262 str
= scm_i_string_start_writing (str
);
1263 scm_i_string_set_x (str
, p
, SCM_CHAR (chr
));
1264 scm_i_string_stop_writing ();
1267 SCM_DEFINE (scm_substring
, "substring", 2, 1, 0,
1268 (SCM str
, SCM start
, SCM end
),
1269 "Return a newly allocated string formed from the characters\n"
1270 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1271 "ending with index @var{end} (exclusive).\n"
1272 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1273 "exact integers satisfying:\n\n"
1274 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1275 #define FUNC_NAME s_scm_substring
1277 size_t len
, from
, to
;
1279 SCM_VALIDATE_STRING (1, str
);
1280 len
= scm_i_string_length (str
);
1281 from
= scm_to_unsigned_integer (start
, 0, len
);
1282 if (SCM_UNBNDP (end
))
1285 to
= scm_to_unsigned_integer (end
, from
, len
);
1286 return scm_i_substring (str
, from
, to
);
1290 SCM_DEFINE (scm_substring_read_only
, "substring/read-only", 2, 1, 0,
1291 (SCM str
, SCM start
, SCM end
),
1292 "Return a newly allocated string formed from the characters\n"
1293 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1294 "ending with index @var{end} (exclusive).\n"
1295 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1296 "exact integers satisfying:\n"
1298 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).\n"
1300 "The returned string is read-only.\n")
1301 #define FUNC_NAME s_scm_substring_read_only
1303 size_t len
, from
, to
;
1305 SCM_VALIDATE_STRING (1, str
);
1306 len
= scm_i_string_length (str
);
1307 from
= scm_to_unsigned_integer (start
, 0, len
);
1308 if (SCM_UNBNDP (end
))
1311 to
= scm_to_unsigned_integer (end
, from
, len
);
1312 return scm_i_substring_read_only (str
, from
, to
);
1316 SCM_DEFINE (scm_substring_copy
, "substring/copy", 2, 1, 0,
1317 (SCM str
, SCM start
, SCM end
),
1318 "Return a newly allocated string formed from the characters\n"
1319 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1320 "ending with index @var{end} (exclusive).\n"
1321 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1322 "exact integers satisfying:\n\n"
1323 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1324 #define FUNC_NAME s_scm_substring_copy
1326 /* For the Scheme version, START is mandatory, but for the C
1327 version, it is optional. See scm_string_copy in srfi-13.c for a
1333 SCM_VALIDATE_STRING (1, str
);
1334 scm_i_get_substring_spec (scm_i_string_length (str
),
1335 start
, &from
, end
, &to
);
1336 return scm_i_substring_copy (str
, from
, to
);
1340 SCM_DEFINE (scm_substring_shared
, "substring/shared", 2, 1, 0,
1341 (SCM str
, SCM start
, SCM end
),
1342 "Return string that indirectly refers to the characters\n"
1343 "of @var{str} beginning with index @var{start} (inclusive) and\n"
1344 "ending with index @var{end} (exclusive).\n"
1345 "@var{str} must be a string, @var{start} and @var{end} must be\n"
1346 "exact integers satisfying:\n\n"
1347 "0 <= @var{start} <= @var{end} <= (string-length @var{str}).")
1348 #define FUNC_NAME s_scm_substring_shared
1350 size_t len
, from
, to
;
1352 SCM_VALIDATE_STRING (1, str
);
1353 len
= scm_i_string_length (str
);
1354 from
= scm_to_unsigned_integer (start
, 0, len
);
1355 if (SCM_UNBNDP (end
))
1358 to
= scm_to_unsigned_integer (end
, from
, len
);
1359 return scm_i_substring_shared (str
, from
, to
);
1363 SCM_DEFINE (scm_string_append
, "string-append", 0, 0, 1,
1365 "Return a newly allocated string whose characters form the\n"
1366 "concatenation of the given strings, @var{args}.")
1367 #define FUNC_NAME s_scm_string_append
1380 SCM_VALIDATE_REST_ARGUMENT (args
);
1381 for (l
= args
; !scm_is_null (l
); l
= SCM_CDR (l
))
1384 SCM_VALIDATE_STRING (SCM_ARGn
, s
);
1385 len
+= scm_i_string_length (s
);
1386 if (!scm_i_is_narrow_string (s
))
1391 res
= scm_i_make_string (len
, &data
.narrow
, 0);
1393 res
= scm_i_make_wide_string (len
, &data
.wide
, 0);
1395 for (l
= args
; !scm_is_null (l
); l
= SCM_CDR (l
))
1399 SCM_VALIDATE_STRING (SCM_ARGn
, s
);
1400 len
= scm_i_string_length (s
);
1403 memcpy (data
.narrow
, scm_i_string_chars (s
), len
);
1408 if (scm_i_is_narrow_string (s
))
1410 for (i
= 0; i
< scm_i_string_length (s
); i
++)
1411 data
.wide
[i
] = (unsigned char) scm_i_string_chars (s
)[i
];
1414 u32_cpy ((scm_t_uint32
*) data
.wide
,
1415 (scm_t_uint32
*) scm_i_string_wide_chars (s
), len
);
1418 scm_remember_upto_here_1 (s
);
1426 /* Charset conversion error handling. */
1428 SCM_SYMBOL (scm_encoding_error_key
, "encoding-error");
1429 SCM_SYMBOL (scm_decoding_error_key
, "decoding-error");
1431 /* Raise an exception informing that character CHR could not be written
1432 to PORT in its current encoding. */
1434 scm_encoding_error (const char *subr
, int err
, const char *message
,
1437 scm_throw (scm_encoding_error_key
,
1438 scm_list_n (scm_from_latin1_string (subr
),
1439 scm_from_latin1_string (message
),
1445 /* Raise an exception informing of an encoding error on PORT. This
1446 means that a character could not be written in PORT's encoding. */
1448 scm_decoding_error (const char *subr
, int err
, const char *message
, SCM port
)
1450 scm_throw (scm_decoding_error_key
,
1451 scm_list_n (scm_from_latin1_string (subr
),
1452 scm_from_latin1_string (message
),
1459 /* String conversion to/from C. */
1462 decoding_error (const char *func_name
, int errno_save
,
1463 const char *str
, size_t len
)
1465 /* Raise an error and pass the raw C string as a bytevector to the `throw'
1470 buf
= scm_gc_malloc_pointerless (len
, "bytevector");
1471 memcpy (buf
, str
, len
);
1472 bv
= scm_c_take_gc_bytevector (buf
, len
, SCM_BOOL_F
);
1474 scm_decoding_error (func_name
, errno_save
,
1475 "input locale conversion error", bv
);
1479 scm_from_stringn (const char *str
, size_t len
, const char *encoding
,
1480 scm_t_string_failed_conversion_handler handler
)
1487 /* The order of these checks is important. */
1488 if (!str
&& len
!= 0)
1489 scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL
);
1490 if (len
== (size_t) -1)
1495 if (encoding
== NULL
)
1497 /* If encoding is null, use Latin-1. */
1499 res
= scm_i_make_string (len
, &buf
, 0);
1500 memcpy (buf
, str
, len
);
1505 u32
= (scm_t_wchar
*) u32_conv_from_encoding (encoding
,
1506 (enum iconv_ilseq_handler
)
1512 if (SCM_UNLIKELY (u32
== NULL
))
1513 decoding_error (__func__
, errno
, str
, len
);
1517 if (u32
[i
++] > 0xFF)
1526 res
= scm_i_make_string (u32len
, &dst
, 0);
1527 for (i
= 0; i
< u32len
; i
++)
1528 dst
[i
] = (unsigned char) u32
[i
];
1534 res
= scm_i_make_wide_string (u32len
, &wdst
, 0);
1535 u32_cpy ((scm_t_uint32
*) wdst
, (scm_t_uint32
*) u32
, u32len
);
1544 scm_from_locale_string (const char *str
)
1546 return scm_from_locale_stringn (str
, -1);
1550 scm_from_locale_stringn (const char *str
, size_t len
)
1552 return scm_from_stringn (str
, len
, locale_charset (),
1553 scm_i_get_conversion_strategy (SCM_BOOL_F
));
1557 scm_from_latin1_string (const char *str
)
1559 return scm_from_latin1_stringn (str
, -1);
1563 scm_from_latin1_stringn (const char *str
, size_t len
)
1568 if (len
== (size_t) -1)
1571 /* Make a narrow string and copy STR as is. */
1572 result
= scm_i_make_string (len
, &buf
, 0);
1573 memcpy (buf
, str
, len
);
1579 scm_from_utf8_string (const char *str
)
1581 return scm_from_utf8_stringn (str
, -1);
1585 scm_from_utf8_stringn (const char *str
, size_t len
)
1588 const scm_t_uint8
*ustr
= (const scm_t_uint8
*) str
;
1589 int ascii
= 1, narrow
= 1;
1592 if (len
== (size_t) -1)
1612 nbytes
= u8_mbtouc (&c
, ustr
+ i
, len
- i
);
1616 decoding_error (__func__
, errno
, str
, len
);
1629 res
= scm_i_make_string (char_len
, &dst
, 0);
1630 memcpy (dst
, str
, len
);
1638 res
= scm_i_make_string (char_len
, &dst
, 0);
1640 for (i
= 0, j
= 0; i
< len
; i
++, j
++)
1642 i
+= u8_mbtouc_unsafe (&c
, ustr
+ i
, len
- i
);
1643 dst
[j
] = (signed char) c
;
1652 res
= scm_i_make_wide_string (char_len
, &dst
, 0);
1654 for (i
= 0, j
= 0; i
< len
; i
++, j
++)
1656 i
+= u8_mbtouc_unsafe (&c
, ustr
+ i
, len
- i
);
1665 scm_from_utf32_string (const scm_t_wchar
*str
)
1667 return scm_from_utf32_stringn (str
, -1);
1671 scm_from_utf32_stringn (const scm_t_wchar
*str
, size_t len
)
1676 if (len
== (size_t) -1)
1677 len
= u32_strlen ((uint32_t *) str
);
1679 result
= scm_i_make_wide_string (len
, &buf
, 0);
1680 memcpy (buf
, str
, len
* sizeof (scm_t_wchar
));
1681 scm_i_try_narrow_string (result
);
1686 /* Create a new scheme string from the C string STR. The memory of
1687 STR may be used directly as storage for the new string. */
1688 /* FIXME: GC-wise, the only way to use the memory area pointed to by STR
1689 would be to register a finalizer to eventually free(3) STR, which isn't
1690 worth it. Should we just deprecate the `scm_take_' functions? */
1692 scm_take_locale_stringn (char *str
, size_t len
)
1696 res
= scm_from_locale_stringn (str
, len
);
1703 scm_take_locale_string (char *str
)
1705 return scm_take_locale_stringn (str
, -1);
1708 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1709 *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
1710 Set *LENP to the size of the resulting string.
1712 FIXME: This is a hack we should get rid of. See
1713 <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
1716 unistring_escapes_to_guile_escapes (char *buf
, size_t *lenp
)
1718 char *before
, *after
;
1727 if ((i
<= *lenp
- 6)
1728 && before
[i
] == '\\'
1729 && before
[i
+ 1] == 'u'
1730 && before
[i
+ 2] == '0' && before
[i
+ 3] == '0')
1732 /* Convert \u00NN to \xNN */
1735 after
[j
+ 2] = tolower ((int) before
[i
+ 4]);
1736 after
[j
+ 3] = tolower ((int) before
[i
+ 5]);
1740 else if ((i
<= *lenp
- 10)
1741 && before
[i
] == '\\'
1742 && before
[i
+ 1] == 'U'
1743 && before
[i
+ 2] == '0' && before
[i
+ 3] == '0')
1745 /* Convert \U00NNNNNN to \UNNNNNN */
1748 after
[j
+ 2] = tolower ((int) before
[i
+ 4]);
1749 after
[j
+ 3] = tolower ((int) before
[i
+ 5]);
1750 after
[j
+ 4] = tolower ((int) before
[i
+ 6]);
1751 after
[j
+ 5] = tolower ((int) before
[i
+ 7]);
1752 after
[j
+ 6] = tolower ((int) before
[i
+ 8]);
1753 after
[j
+ 7] = tolower ((int) before
[i
+ 9]);
1759 after
[j
] = before
[i
];
1767 /* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
1768 *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
1769 of the resulting string. BUF must be large enough to handle the
1770 worst case when `\uXXXX' escapes (6 characters) are replaced by
1771 `\xXXXX;' (7 characters). */
1773 unistring_escapes_to_r6rs_escapes (char *buf
, size_t *lenp
)
1775 char *before
, *after
;
1777 /* The worst case is if the input string contains all 4-digit hex escapes.
1778 "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
1779 size_t max_out_len
= (*lenp
* 7) / 6 + 1;
1780 size_t nzeros
, ndigits
;
1783 after
= alloca (max_out_len
);
1788 if (((i
<= *lenp
- 6) && before
[i
] == '\\' && before
[i
+ 1] == 'u')
1789 || ((i
<= *lenp
- 10) && before
[i
] == '\\' && before
[i
+ 1] == 'U'))
1791 if (before
[i
+ 1] == 'u')
1793 else if (before
[i
+ 1] == 'U')
1798 /* Add the R6RS hex escape initial sequence. */
1802 /* Move string positions to the start of the hex numbers. */
1806 /* Find the number of initial zeros in this hex number. */
1808 while (before
[i
+ nzeros
] == '0' && nzeros
< ndigits
)
1811 /* Copy the number, skipping initial zeros, and then move the string
1813 if (nzeros
== ndigits
)
1822 for (pos
= 0; pos
< ndigits
- nzeros
; pos
++)
1823 after
[j
+ pos
] = tolower ((int) before
[i
+ nzeros
+ pos
]);
1825 j
+= (ndigits
- nzeros
);
1828 /* Add terminating semicolon. */
1834 after
[j
] = before
[i
];
1840 memcpy (before
, after
, j
);
1844 scm_to_locale_string (SCM str
)
1846 return scm_to_locale_stringn (str
, NULL
);
1850 scm_to_locale_stringn (SCM str
, size_t *lenp
)
1852 return scm_to_stringn (str
, lenp
,
1854 scm_i_get_conversion_strategy (SCM_BOOL_F
));
1858 scm_to_latin1_string (SCM str
)
1860 return scm_to_latin1_stringn (str
, NULL
);
1864 scm_to_latin1_stringn (SCM str
, size_t *lenp
)
1865 #define FUNC_NAME "scm_to_latin1_stringn"
1869 SCM_VALIDATE_STRING (1, str
);
1871 if (scm_i_is_narrow_string (str
))
1873 size_t len
= scm_i_string_length (str
);
1878 result
= scm_strndup (scm_i_string_data (str
), len
);
1881 result
= scm_to_stringn (str
, lenp
, NULL
,
1882 SCM_FAILED_CONVERSION_ERROR
);
1889 scm_to_utf8_string (SCM str
)
1891 return scm_to_utf8_stringn (str
, NULL
);
1895 scm_to_utf8_stringn (SCM str
, size_t *lenp
)
1897 return scm_to_stringn (str
, lenp
, "UTF-8", SCM_FAILED_CONVERSION_ERROR
);
1901 scm_to_utf32_string (SCM str
)
1903 return scm_to_utf32_stringn (str
, NULL
);
1907 scm_to_utf32_stringn (SCM str
, size_t *lenp
)
1908 #define FUNC_NAME "scm_to_utf32_stringn"
1910 scm_t_wchar
*result
;
1912 SCM_VALIDATE_STRING (1, str
);
1914 if (scm_i_is_narrow_string (str
))
1915 result
= (scm_t_wchar
*)
1916 scm_to_stringn (str
, lenp
, "UTF-32",
1917 SCM_FAILED_CONVERSION_ERROR
);
1922 len
= scm_i_string_length (str
);
1926 result
= scm_malloc ((len
+ 1) * sizeof (scm_t_wchar
));
1927 memcpy (result
, scm_i_string_wide_chars (str
),
1928 len
* sizeof (scm_t_wchar
));
1936 /* Return a malloc(3)-allocated buffer containing the contents of STR encoded
1937 according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
1938 the returned buffer. If the conversion to ENCODING fails, apply the strategy
1939 defined by HANDLER. */
1941 scm_to_stringn (SCM str
, size_t *lenp
, const char *encoding
,
1942 scm_t_string_failed_conversion_handler handler
)
1945 size_t ilen
, len
, i
;
1949 if (!scm_is_string (str
))
1950 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
1951 ilen
= scm_i_string_length (str
);
1955 buf
= scm_malloc (1);
1963 for (i
= 0; i
< ilen
; i
++)
1964 if (scm_i_string_ref (str
, i
) == '\0')
1965 scm_misc_error (NULL
,
1966 "string contains #\\nul character: ~S",
1969 if (scm_i_is_narrow_string (str
) && (encoding
== NULL
))
1971 /* If using native Latin-1 encoding, just copy the string
1975 buf
= scm_malloc (ilen
);
1976 memcpy (buf
, scm_i_string_chars (str
), ilen
);
1982 buf
= scm_malloc (ilen
+ 1);
1983 memcpy (buf
, scm_i_string_chars (str
), ilen
);
1995 if (scm_i_is_narrow_string (str
))
1997 ret
= mem_iconveh (scm_i_string_chars (str
), ilen
,
1999 (enum iconv_ilseq_handler
) handler
, NULL
,
2003 scm_encoding_error (__func__
, errno
,
2004 "cannot convert narrow string to output locale",
2006 /* FIXME: Faulty character unknown. */
2011 buf
= u32_conv_to_encoding (enc
,
2012 (enum iconv_ilseq_handler
) handler
,
2013 (scm_t_uint32
*) scm_i_string_wide_chars (str
),
2018 scm_encoding_error (__func__
, errno
,
2019 "cannot convert wide string to output locale",
2021 /* FIXME: Faulty character unknown. */
2024 if (handler
== SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE
)
2026 if (SCM_R6RS_ESCAPES_P
)
2028 /* The worst case is if the input string contains all 4-digit
2029 hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
2030 (seven characters). Make BUF large enough to hold
2032 buf
= scm_realloc (buf
, (len
* 7) / 6 + 1);
2033 unistring_escapes_to_r6rs_escapes (buf
, &len
);
2036 unistring_escapes_to_guile_escapes (buf
, &len
);
2038 buf
= scm_realloc (buf
, len
);
2044 buf
= scm_realloc (buf
, len
+ 1);
2048 scm_remember_upto_here_1 (str
);
2053 scm_to_locale_stringbuf (SCM str
, char *buf
, size_t max_len
)
2056 char *result
= NULL
;
2057 if (!scm_is_string (str
))
2058 scm_wrong_type_arg_msg (NULL
, 0, str
, "string");
2059 result
= scm_to_locale_stringn (str
, &len
);
2061 memcpy (buf
, result
, (len
> max_len
) ? max_len
: len
);
2064 scm_remember_upto_here_1 (str
);
2069 /* Unicode string normalization. */
2071 /* This function is a partial clone of SCM_STRING_TO_U32_BUF from
2072 libguile/i18n.c. It would be useful to have this factored out into a more
2073 convenient location, but its use of alloca makes that tricky to do. */
2076 normalize_str (SCM string
, uninorm_t form
)
2079 scm_t_uint32
*w_str
;
2081 size_t rlen
, len
= scm_i_string_length (string
);
2083 if (scm_i_is_narrow_string (string
))
2086 const char *buf
= scm_i_string_chars (string
);
2088 w_str
= alloca (sizeof (scm_t_wchar
) * (len
+ 1));
2090 for (i
= 0; i
< len
; i
++)
2091 w_str
[i
] = (unsigned char) buf
[i
];
2095 w_str
= (scm_t_uint32
*) scm_i_string_wide_chars (string
);
2097 w_str
= u32_normalize (form
, w_str
, len
, NULL
, &rlen
);
2099 ret
= scm_i_make_wide_string (rlen
, &cbuf
, 0);
2100 u32_cpy ((scm_t_uint32
*) cbuf
, w_str
, rlen
);
2103 scm_i_try_narrow_string (ret
);
2108 SCM_DEFINE (scm_string_normalize_nfc
, "string-normalize-nfc", 1, 0, 0,
2110 "Returns the NFC normalized form of @var{string}.")
2111 #define FUNC_NAME s_scm_string_normalize_nfc
2113 SCM_VALIDATE_STRING (1, string
);
2114 return normalize_str (string
, UNINORM_NFC
);
2118 SCM_DEFINE (scm_string_normalize_nfd
, "string-normalize-nfd", 1, 0, 0,
2120 "Returns the NFD normalized form of @var{string}.")
2121 #define FUNC_NAME s_scm_string_normalize_nfd
2123 SCM_VALIDATE_STRING (1, string
);
2124 return normalize_str (string
, UNINORM_NFD
);
2128 SCM_DEFINE (scm_string_normalize_nfkc
, "string-normalize-nfkc", 1, 0, 0,
2130 "Returns the NFKC normalized form of @var{string}.")
2131 #define FUNC_NAME s_scm_string_normalize_nfkc
2133 SCM_VALIDATE_STRING (1, string
);
2134 return normalize_str (string
, UNINORM_NFKC
);
2138 SCM_DEFINE (scm_string_normalize_nfkd
, "string-normalize-nfkd", 1, 0, 0,
2140 "Returns the NFKD normalized form of @var{string}.")
2141 #define FUNC_NAME s_scm_string_normalize_nfkd
2143 SCM_VALIDATE_STRING (1, string
);
2144 return normalize_str (string
, UNINORM_NFKD
);
2148 /* converts C scm_array of strings to SCM scm_list of strings.
2149 If argc < 0, a null terminated scm_array is assumed.
2150 The current locale encoding is assumed */
2152 scm_makfromstrs (int argc
, char **argv
)
2157 for (i
= 0; argv
[i
]; i
++);
2159 lst
= scm_cons (scm_from_locale_string (argv
[i
]), lst
);
2163 /* Return a newly allocated array of char pointers to each of the strings
2164 in args, with a terminating NULL pointer. The strings are encoded using
2165 the current locale. */
2168 scm_i_allocate_string_pointers (SCM list
)
2169 #define FUNC_NAME "scm_i_allocate_string_pointers"
2172 int list_len
= scm_ilength (list
);
2176 scm_wrong_type_arg_msg (NULL
, 0, list
, "proper list");
2178 result
= scm_gc_malloc ((list_len
+ 1) * sizeof (char *),
2180 result
[list_len
] = NULL
;
2182 /* The list might have been modified in another thread, so
2183 we check LIST before each access.
2185 for (i
= 0; i
< list_len
&& scm_is_pair (list
); i
++)
2187 SCM str
= SCM_CAR (list
);
2188 size_t len
; /* String length in bytes */
2189 char *c_str
= scm_to_locale_stringn (str
, &len
);
2191 /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
2192 scm_malloc to allocate the returned string, which must be
2193 explicitly deallocated. This forces us to copy the string a
2194 second time into a new buffer. Ideally there would be variants
2195 of scm_to_*_stringn that can return garbage-collected buffers. */
2197 result
[i
] = scm_gc_malloc_pointerless (len
+ 1, "string");
2198 memcpy (result
[i
], c_str
, len
);
2199 result
[i
][len
] = '\0';
2202 list
= SCM_CDR (list
);
2210 scm_i_get_substring_spec (size_t len
,
2211 SCM start
, size_t *cstart
,
2212 SCM end
, size_t *cend
)
2214 if (SCM_UNBNDP (start
))
2217 *cstart
= scm_to_unsigned_integer (start
, 0, len
);
2219 if (SCM_UNBNDP (end
))
2222 *cend
= scm_to_unsigned_integer (end
, *cstart
, len
);
2226 string_handle_ref (scm_t_array_handle
*h
, size_t index
)
2228 return scm_c_string_ref (h
->array
, index
);
2232 string_handle_set (scm_t_array_handle
*h
, size_t index
, SCM val
)
2234 scm_c_string_set_x (h
->array
, index
, val
);
2238 string_get_handle (SCM v
, scm_t_array_handle
*h
)
2244 h
->dim0
.ubnd
= scm_c_string_length (v
) - 1;
2246 h
->element_type
= SCM_ARRAY_ELEMENT_TYPE_CHAR
;
2247 h
->elements
= h
->writable_elements
= NULL
;
2250 SCM_ARRAY_IMPLEMENTATION (scm_tc7_string
, 0x7f,
2251 string_handle_ref
, string_handle_set
,
2253 SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR
, scm_make_string
)
2258 scm_nullstr
= scm_i_make_string (0, NULL
, 0);
2260 #include "libguile/strings.x"