-/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
lenhist[1000]++;
#endif
- buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
+ buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
"string"));
SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
#endif
raw_len = (len + 1) * sizeof (scm_t_wchar);
- buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
+ buf = SCM_PACK_POINTER (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
"string"));
SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
}
scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
+SCM_PTHREAD_ATFORK_LOCK_STATIC_MUTEX (stringbuf_write_mutex);
\f
/* Copy-on-write strings.
#define SET_STRING_STRINGBUF(str,buf) (SCM_SET_CELL_OBJECT_1(str,buf))
#define SET_STRING_START(str,start) (SCM_SET_CELL_WORD_2(str,start))
-#define IS_STRING(str) (SCM_NIMP(str) && SCM_TYP7(str) == STRING_TAG)
+#define IS_STRING(str) (SCM_HAS_TYP7 (str, STRING_TAG))
/* Read-only strings.
*/
SCM
scm_i_make_string (size_t len, char **charsp, int read_only_p)
{
- SCM buf = make_stringbuf (len);
+ static SCM null_stringbuf = SCM_BOOL_F;
+ SCM buf;
SCM res;
+
+ if (len == 0)
+ {
+ if (SCM_UNLIKELY (scm_is_false (null_stringbuf)))
+ {
+ null_stringbuf = make_stringbuf (0);
+ SET_STRINGBUF_SHARED (null_stringbuf);
+ }
+ buf = null_stringbuf;
+ }
+ else
+ buf = make_stringbuf (len);
+
if (charsp)
*charsp = (char *) STRINGBUF_CHARS (buf);
res = scm_double_cell (read_only_p ? RO_STRING_TAG : STRING_TAG,
SCM
scm_i_substring (SCM str, size_t start, size_t end)
{
- SCM buf;
- size_t str_start;
- get_str_buf_start (&str, &buf, &str_start);
- scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
- SET_STRINGBUF_SHARED (buf);
- scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
- return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
- (scm_t_bits)str_start + start,
- (scm_t_bits) end - start);
+ if (start == end)
+ return scm_i_make_string (0, NULL, 0);
+ else
+ {
+ SCM buf;
+ size_t str_start;
+ get_str_buf_start (&str, &buf, &str_start);
+ scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
+ SET_STRINGBUF_SHARED (buf);
+ scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
+ return scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
+ (scm_t_bits)str_start + start,
+ (scm_t_bits) end - start);
+ }
}
SCM
scm_i_substring_read_only (SCM str, size_t start, size_t end)
{
- SCM buf;
- size_t str_start;
- get_str_buf_start (&str, &buf, &str_start);
- scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
- SET_STRINGBUF_SHARED (buf);
- scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
- return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
- (scm_t_bits)str_start + start,
- (scm_t_bits) end - start);
+ if (start == end)
+ return scm_i_make_string (0, NULL, 1);
+ else
+ {
+ SCM buf;
+ size_t str_start;
+ get_str_buf_start (&str, &buf, &str_start);
+ scm_i_pthread_mutex_lock (&stringbuf_write_mutex);
+ SET_STRINGBUF_SHARED (buf);
+ scm_i_pthread_mutex_unlock (&stringbuf_write_mutex);
+ return scm_double_cell (RO_STRING_TAG, SCM_UNPACK(buf),
+ (scm_t_bits)str_start + start,
+ (scm_t_bits) end - start);
+ }
}
SCM
scm_i_substring_copy (SCM str, size_t start, size_t end)
{
- size_t len = end - start;
- SCM buf, my_buf;
- size_t str_start;
- get_str_buf_start (&str, &buf, &str_start);
- if (scm_i_is_narrow_string (str))
- {
- my_buf = make_stringbuf (len);
- memcpy (STRINGBUF_CHARS (my_buf),
- STRINGBUF_CHARS (buf) + str_start + start, len);
- }
+ if (start == end)
+ return scm_i_make_string (0, NULL, 0);
else
{
- my_buf = make_wide_stringbuf (len);
- u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
- (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
- + start), len);
- /* Even though this string is wide, the substring may be narrow.
- Consider adding code to narrow the string. */
+ size_t len = end - start;
+ SCM buf, my_buf, substr;
+ size_t str_start;
+ int wide = 0;
+ get_str_buf_start (&str, &buf, &str_start);
+ if (scm_i_is_narrow_string (str))
+ {
+ my_buf = make_stringbuf (len);
+ memcpy (STRINGBUF_CHARS (my_buf),
+ STRINGBUF_CHARS (buf) + str_start + start, len);
+ }
+ else
+ {
+ my_buf = make_wide_stringbuf (len);
+ u32_cpy ((scm_t_uint32 *) STRINGBUF_WIDE_CHARS (my_buf),
+ (scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf) + str_start
+ + start), len);
+ wide = 1;
+ }
+ scm_remember_upto_here_1 (buf);
+ substr = scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
+ (scm_t_bits) 0, (scm_t_bits) len);
+ if (wide)
+ scm_i_try_narrow_string (substr);
+ return substr;
}
- scm_remember_upto_here_1 (buf);
- return scm_double_cell (STRING_TAG, SCM_UNPACK (my_buf),
- (scm_t_bits) 0, (scm_t_bits) len);
}
SCM
{
if (start == 0 && end == STRING_LENGTH (str))
return str;
- else
+ else if (start == end)
+ return scm_i_make_string (0, NULL, 0);
+ else
{
size_t len = end - start;
if (IS_SH_STRING (str))
int
scm_i_is_narrow_string (SCM str)
{
+ if (IS_SH_STRING (str))
+ str = SH_STRING_STRING (str);
+
return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
}
int
scm_i_try_narrow_string (SCM str)
{
+ if (IS_SH_STRING (str))
+ str = SH_STRING_STRING (str);
+
SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
return scm_i_is_narrow_string (str);
void
scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
{
+ if (IS_SH_STRING (str))
+ {
+ p += STRING_START (str);
+ str = SH_STRING_STRING (str);
+ }
+
if (chr > 0xFF && scm_i_is_narrow_string (str))
SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
SCM buf = make_stringbuf (len);
memcpy (STRINGBUF_CHARS (buf), name, len);
- return scm_immutable_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
- (scm_t_bits) hash, SCM_UNPACK (props));
+ return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
+ (scm_t_bits) hash, SCM_UNPACK (props));
}
/* Returns the number of characters in SYM. This may be different
"Return a newly allocated string of\n"
"length @var{k}. If @var{chr} is given, then all elements of\n"
"the string are initialized to @var{chr}, otherwise the contents\n"
- "of the @var{string} are all set to @var{#\nul}.")
+ "of the string are all set to @code{#\nul}.")
#define FUNC_NAME s_scm_make_string
{
return scm_c_make_string (scm_to_size_t (k), chr);
\f
/* String conversion to/from C. */
+static void
+decoding_error (const char *func_name, int errno_save,
+ const char *str, size_t len)
+{
+ /* Raise an error and pass the raw C string as a bytevector to the `throw'
+ handler. */
+ SCM bv;
+ signed char *buf;
+
+ buf = scm_gc_malloc_pointerless (len, "bytevector");
+ memcpy (buf, str, len);
+ bv = scm_c_take_gc_bytevector (buf, len, SCM_BOOL_F);
+
+ scm_decoding_error (func_name, errno_save,
+ "input locale conversion error", bv);
+}
+
SCM
scm_from_stringn (const char *str, size_t len, const char *encoding,
scm_t_string_failed_conversion_handler handler)
scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
if (len == (size_t) -1)
len = strlen (str);
- if (len == 0)
- return scm_nullstr;
- if (encoding == NULL)
- {
- /* If encoding is null, use Latin-1. */
- char *buf;
- res = scm_i_make_string (len, &buf, 0);
- memcpy (buf, str, len);
- return res;
- }
+ if (encoding == NULL || len == 0)
+ return scm_from_latin1_stringn (str, len);
+ else if (strcmp (encoding, "UTF-8") == 0)
+ return scm_from_utf8_stringn (str, len);
u32len = 0;
u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
NULL, &u32len);
if (SCM_UNLIKELY (u32 == NULL))
- {
- /* Raise an error and pass the raw C string as a bytevector to the `throw'
- handler. */
- SCM bv;
- signed char *buf;
-
- buf = scm_gc_malloc_pointerless (len, "bytevector");
- memcpy (buf, str, len);
- bv = scm_c_take_bytevector (buf, len, SCM_BOOL_F);
-
- scm_decoding_error (__func__, errno,
- "input locale conversion error", bv);
- }
+ decoding_error (__func__, errno, str, len);
i = 0;
while (i < u32len)
SCM
scm_from_utf8_stringn (const char *str, size_t len)
{
- return scm_from_stringn (str, len, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
+ size_t i, char_len;
+ const scm_t_uint8 *ustr = (const scm_t_uint8 *) str;
+ int ascii = 1, narrow = 1;
+ SCM res;
+
+ if (len == (size_t) -1)
+ len = strlen (str);
+
+ i = 0;
+ char_len = 0;
+
+ while (i < len)
+ {
+ if (ustr[i] <= 127)
+ {
+ char_len++;
+ i++;
+ }
+ else
+ {
+ ucs4_t c;
+ int nbytes;
+
+ ascii = 0;
+
+ nbytes = u8_mbtouc (&c, ustr + i, len - i);
+
+ if (nbytes < 0)
+ /* Bad UTF-8. */
+ decoding_error (__func__, errno, str, len);
+
+ if (c > 255)
+ narrow = 0;
+
+ char_len++;
+ i += nbytes;
+ }
+ }
+
+ if (ascii)
+ {
+ char *dst;
+ res = scm_i_make_string (char_len, &dst, 0);
+ memcpy (dst, str, len);
+ }
+ else if (narrow)
+ {
+ char *dst;
+ size_t j;
+ ucs4_t c;
+
+ res = scm_i_make_string (char_len, &dst, 0);
+
+ for (i = 0, j = 0; i < len; j++)
+ {
+ i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
+ dst[j] = (signed char) c;
+ }
+ }
+ else
+ {
+ scm_t_wchar *dst;
+ size_t j;
+ ucs4_t c;
+
+ res = scm_i_make_wide_string (char_len, &dst, 0);
+
+ for (i = 0, j = 0; i < len; j++)
+ {
+ i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
+ dst[j] = c;
+ }
+ }
+
+ return res;
}
SCM
if (scm_i_is_narrow_string (str))
{
+ size_t len = scm_i_string_length (str);
+
if (lenp)
- *lenp = scm_i_string_length (str);
+ *lenp = len;
- result = scm_strdup (scm_i_string_data (str));
+ result = scm_strndup (scm_i_string_data (str), len);
}
else
result = scm_to_stringn (str, lenp, NULL,
- SCM_FAILED_CONVERSION_ERROR);
+ SCM_FAILED_CONVERSION_ERROR);
return result;
}
return scm_to_utf8_stringn (str, NULL);
}
+static size_t
+latin1_u8_strlen (const scm_t_uint8 *str, size_t len)
+{
+ size_t ret, i;
+ for (i = 0, ret = 0; i < len; i++)
+ ret += (str[i] < 128) ? 1 : 2;
+ return ret;
+}
+
+static scm_t_uint8*
+latin1_to_u8 (const scm_t_uint8 *str, size_t latin_len,
+ scm_t_uint8 *u8_result, size_t *u8_lenp)
+{
+ size_t i, n;
+ size_t u8_len = latin1_u8_strlen (str, latin_len);
+
+ if (!(u8_result && u8_lenp && *u8_lenp > u8_len))
+ u8_result = scm_malloc (u8_len + 1);
+ if (u8_lenp)
+ *u8_lenp = u8_len;
+
+ for (i = 0, n = 0; i < latin_len; i++)
+ n += u8_uctomb (u8_result + n, str[i], u8_len - n);
+ if (n != u8_len)
+ abort ();
+ u8_result[n] = 0;
+
+ return u8_result;
+}
+
char *
scm_to_utf8_stringn (SCM str, size_t *lenp)
{
- return scm_to_stringn (str, lenp, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
+ if (scm_i_is_narrow_string (str))
+ return (char *) latin1_to_u8 ((scm_t_uint8 *) scm_i_string_chars (str),
+ scm_i_string_length (str),
+ NULL, lenp);
+ else
+ return (char *) u32_to_u8 ((scm_t_uint32*)scm_i_string_wide_chars (str),
+ scm_i_string_length (str),
+ NULL, lenp);
}
scm_t_wchar *
SCM_VALIDATE_STRING (1, str);
if (scm_i_is_narrow_string (str))
- result = (scm_t_wchar *)
- scm_to_stringn (str, lenp, "UTF-32",
- SCM_FAILED_CONVERSION_ERROR);
+ {
+ scm_t_uint8 *codepoints;
+ size_t i, len;
+
+ codepoints = (scm_t_uint8*) scm_i_string_chars (str);
+ len = scm_i_string_length (str);
+ if (lenp)
+ *lenp = len;
+
+ result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
+ for (i = 0; i < len; i++)
+ result[i] = codepoints[i];
+ result[len] = 0;
+ }
else
{
size_t len;
}
#undef FUNC_NAME
-/* converts C scm_array of strings to SCM scm_list of strings. */
-/* If argc < 0, a null terminated scm_array is assumed. */
+/* converts C scm_array of strings to SCM scm_list of strings.
+ If argc < 0, a null terminated scm_array is assumed.
+ The current locale encoding is assumed */
SCM
scm_makfromstrs (int argc, char **argv)
{
}
/* Return a newly allocated array of char pointers to each of the strings
- in args, with a terminating NULL pointer. */
+ in args, with a terminating NULL pointer. The strings are encoded using
+ the current locale. */
char **
scm_i_allocate_string_pointers (SCM list)
#define FUNC_NAME "scm_i_allocate_string_pointers"
{
char **result;
- int len = scm_ilength (list);
+ int list_len = scm_ilength (list);
int i;
- if (len < 0)
+ if (list_len < 0)
scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
- result = scm_gc_malloc ((len + 1) * sizeof (char *),
+ result = scm_gc_malloc ((list_len + 1) * sizeof (char *),
"string pointers");
- result[len] = NULL;
+ result[list_len] = NULL;
- /* The list might be have been modified in another thread, so
+ /* The list might have been modified in another thread, so
we check LIST before each access.
*/
- for (i = 0; i < len && scm_is_pair (list); i++)
+ for (i = 0; i < list_len && scm_is_pair (list); i++)
{
- SCM str;
- size_t len;
-
- str = SCM_CAR (list);
- len = scm_c_string_length (str);
-
- result[i] = scm_gc_malloc_pointerless (len + 1, "string pointers");
- memcpy (result[i], scm_i_string_chars (str), len);
+ SCM str = SCM_CAR (list);
+ size_t len; /* String length in bytes */
+ char *c_str = scm_to_locale_stringn (str, &len);
+
+ /* OPTIMIZE-ME: Right now, scm_to_locale_stringn always uses
+ scm_malloc to allocate the returned string, which must be
+ explicitly deallocated. This forces us to copy the string a
+ second time into a new buffer. Ideally there would be variants
+ of scm_to_*_stringn that can return garbage-collected buffers. */
+
+ result[i] = scm_gc_malloc_pointerless (len + 1, "string");
+ memcpy (result[i], c_str, len);
result[i][len] = '\0';
+ free (c_str);
list = SCM_CDR (list);
}
*cend = scm_to_unsigned_integer (end, *cstart, len);
}
-#if SCM_ENABLE_DEPRECATED
-
-/* When these definitions are removed, it becomes reasonable to use
- read-only strings for string literals. For that, change the reader
- to create string literals with scm_c_substring_read_only instead of
- with scm_c_substring_copy.
-*/
-
-int
-scm_i_deprecated_stringp (SCM str)
-{
- scm_c_issue_deprecation_warning
- ("SCM_STRINGP is deprecated. Use scm_is_string instead.");
-
- return scm_is_string (str);
-}
-
-char *
-scm_i_deprecated_string_chars (SCM str)
-{
- char *chars;
-
- scm_c_issue_deprecation_warning
- ("SCM_STRING_CHARS is deprecated. See the manual for alternatives.");
-
- /* We don't accept shared substrings here since they are not
- null-terminated.
- */
- if (IS_SH_STRING (str))
- scm_misc_error (NULL,
- "SCM_STRING_CHARS does not work with shared substrings",
- SCM_EOL);
-
- /* We explicitly test for read-only strings to produce a better
- error message.
- */
-
- if (IS_RO_STRING (str))
- scm_misc_error (NULL,
- "SCM_STRING_CHARS does not work with read-only strings",
- SCM_EOL);
-
- /* The following is still wrong, of course...
- */
- str = scm_i_string_start_writing (str);
- chars = scm_i_string_writable_chars (str);
- scm_i_string_stop_writing ();
- return chars;
-}
-
-size_t
-scm_i_deprecated_string_length (SCM str)
-{
- scm_c_issue_deprecation_warning
- ("SCM_STRING_LENGTH is deprecated. Use scm_c_string_length instead.");
- return scm_c_string_length (str);
-}
-
-#endif
-
static SCM
string_handle_ref (scm_t_array_handle *h, size_t index)
{
void
scm_init_strings ()
{
- scm_nullstr = scm_i_make_string (0, NULL, 1);
+ scm_nullstr = scm_i_make_string (0, NULL, 0);
#include "libguile/strings.x"
}