-/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009 Free Software Foundation, Inc.
+/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
# include <config.h>
#endif
+#include <alloca.h>
#include <string.h>
#include <stdio.h>
#include <ctype.h>
+#include <uninorm.h>
#include <unistr.h>
+#include <uniconv.h>
+
+#include "striconveh.h"
#include "libguile/_scm.h"
#include "libguile/chars.h"
#include "libguile/root.h"
#include "libguile/strings.h"
+#include "libguile/error.h"
+#include "libguile/generalized-vectors.h"
#include "libguile/deprecation.h"
#include "libguile/validate.h"
-#include "libguile/dynwind.h"
+#include "libguile/private-options.h"
\f
* cow-strings, but it failed randomly with more than 10 threads, say.
* I couldn't figure out what went wrong, so I used the conservative
* approach implemented below.
- *
- * A stringbuf needs to know its length, but only so that it can be
- * reported when the stringbuf is freed.
- *
- * There are 3 storage strategies for stringbufs: inline, outline, and
- * wide.
- *
- * Inline strings are small 8-bit strings stored within the double
- * cell itself. Outline strings are larger 8-bit strings with GC
- * allocated storage. Wide strings are 32-bit strings with allocated
- * storage.
*
- * There was little value in making wide string inlineable, since
- * there is only room for three inlined 32-bit characters. Thus wide
- * stringbufs are never inlined.
+ * There are 2 storage strategies for stringbufs: 8-bit and wide. 8-bit
+ * strings are ISO-8859-1-encoded strings; wide strings are 32-bit (UCS-4)
+ * strings.
*/
-#define STRINGBUF_F_SHARED 0x100
-#define STRINGBUF_F_INLINE 0x200
-#define STRINGBUF_F_WIDE 0x400 /* If true, strings have UCS-4
- encoding. Otherwise, strings
- are Latin-1. */
+/* The size in words of the stringbuf header (type tag + size). */
+#define STRINGBUF_HEADER_SIZE 2U
+
+#define STRINGBUF_HEADER_BYTES (STRINGBUF_HEADER_SIZE * sizeof (SCM))
+
+#define STRINGBUF_F_SHARED SCM_I_STRINGBUF_F_SHARED
+#define STRINGBUF_F_WIDE SCM_I_STRINGBUF_F_WIDE
#define STRINGBUF_TAG scm_tc7_stringbuf
#define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
-#define STRINGBUF_INLINE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_INLINE)
#define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
-#define STRINGBUF_OUTLINE_CHARS(buf) ((char *)SCM_CELL_WORD_1(buf))
-#define STRINGBUF_OUTLINE_LENGTH(buf) (SCM_CELL_WORD_2(buf))
-#define STRINGBUF_INLINE_CHARS(buf) ((char *)SCM_CELL_OBJECT_LOC(buf,1))
-#define STRINGBUF_INLINE_LENGTH(buf) (((size_t)SCM_CELL_WORD_0(buf))>>16)
-
-#define STRINGBUF_CHARS(buf) (STRINGBUF_INLINE (buf) \
- ? STRINGBUF_INLINE_CHARS (buf) \
- : STRINGBUF_OUTLINE_CHARS (buf))
-
-#define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *)SCM_CELL_WORD_1(buf))
-#define STRINGBUF_LENGTH(buf) (STRINGBUF_INLINE (buf) \
- ? STRINGBUF_INLINE_LENGTH (buf) \
- : STRINGBUF_OUTLINE_LENGTH (buf))
-
-#define STRINGBUF_MAX_INLINE_LEN (3*sizeof(scm_t_bits))
-
-#define SET_STRINGBUF_SHARED(buf) \
- (SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED))
-
-#if SCM_STRING_LENGTH_HISTOGRAM
+#define STRINGBUF_CONTENTS(buf) ((void *) \
+ SCM_CELL_OBJECT_LOC (buf, \
+ STRINGBUF_HEADER_SIZE))
+#define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
+#define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
+
+#define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
+
+#define SET_STRINGBUF_SHARED(buf) \
+ do \
+ { \
+ /* Don't modify BUF if it's already marked as shared since it might be \
+ a read-only, statically allocated stringbuf. */ \
+ if (SCM_LIKELY (!STRINGBUF_SHARED (buf))) \
+ SCM_SET_CELL_WORD_0 ((buf), SCM_CELL_WORD_0 (buf) | STRINGBUF_F_SHARED); \
+ } \
+ while (0)
+
+#ifdef SCM_STRING_LENGTH_HISTOGRAM
static size_t lenhist[1001];
#endif
can be dropped.
*/
-#if SCM_STRING_LENGTH_HISTOGRAM
+ SCM buf;
+
+#ifdef SCM_STRING_LENGTH_HISTOGRAM
if (len < 1000)
lenhist[len]++;
else
lenhist[1000]++;
#endif
- if (len <= STRINGBUF_MAX_INLINE_LEN-1)
- {
- return scm_double_cell (STRINGBUF_TAG | STRINGBUF_F_INLINE | (len << 16),
- 0, 0, 0);
- }
- else
- {
- char *mem = scm_gc_malloc (len+1, "string");
- mem[len] = '\0';
- return scm_double_cell (STRINGBUF_TAG, (scm_t_bits) mem,
- (scm_t_bits) len, (scm_t_bits) 0);
- }
+ buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + len + 1,
+ "string"));
+
+ SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG);
+ SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
+
+ STRINGBUF_CHARS (buf)[len] = 0;
+
+ return buf;
}
/* Make a stringbuf with space for LEN 32-bit UCS-4-encoded
static SCM
make_wide_stringbuf (size_t len)
{
- scm_t_wchar *mem;
-#if SCM_STRING_LENGTH_HISTOGRAM
+ SCM buf;
+ size_t raw_len;
+
+#ifdef SCM_STRING_LENGTH_HISTOGRAM
if (len < 1000)
lenhist[len]++;
else
lenhist[1000]++;
#endif
- mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
- mem[len] = 0;
- return scm_double_cell (STRINGBUF_TAG | STRINGBUF_F_WIDE, (scm_t_bits) mem,
- (scm_t_bits) len, (scm_t_bits) 0);
-}
+ raw_len = (len + 1) * sizeof (scm_t_wchar);
+ buf = PTR2SCM (scm_gc_malloc_pointerless (STRINGBUF_HEADER_BYTES + raw_len,
+ "string"));
-/* Return a new stringbuf whose underlying storage consists of the LEN+1
- octets pointed to by STR (the last octet is zero). */
-SCM
-scm_i_take_stringbufn (char *str, size_t len)
-{
- scm_gc_register_collectable_memory (str, len + 1, "stringbuf");
+ SCM_SET_CELL_TYPE (buf, STRINGBUF_TAG | STRINGBUF_F_WIDE);
+ SCM_SET_CELL_WORD_1 (buf, (scm_t_bits) len);
- return scm_double_cell (STRINGBUF_TAG, (scm_t_bits) str,
- (scm_t_bits) len, (scm_t_bits) 0);
-}
+ STRINGBUF_WIDE_CHARS (buf)[len] = 0;
-SCM
-scm_i_stringbuf_mark (SCM buf)
-{
- return SCM_BOOL_F;
+ return buf;
}
-void
-scm_i_stringbuf_free (SCM buf)
+/* Return a UCS-4-encoded stringbuf containing the (possibly Latin-1-encoded)
+ characters from BUF. */
+static SCM
+wide_stringbuf (SCM buf)
{
- if (!STRINGBUF_INLINE (buf))
+ SCM new_buf;
+
+ if (STRINGBUF_WIDE (buf))
+ new_buf = buf;
+ else
{
- if (!STRINGBUF_WIDE (buf))
- scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf),
- STRINGBUF_OUTLINE_LENGTH (buf) + 1, "string");
- else
- scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf),
- sizeof (scm_t_wchar) * (STRINGBUF_OUTLINE_LENGTH (buf)
- + 1), "string");
+ size_t i, len;
+ scm_t_wchar *mem;
+
+ len = STRINGBUF_LENGTH (buf);
+
+ new_buf = make_wide_stringbuf (len);
+
+ mem = STRINGBUF_WIDE_CHARS (new_buf);
+ for (i = 0; i < len; i++)
+ mem[i] = (scm_t_wchar) STRINGBUF_CHARS (buf)[i];
+ mem[len] = 0;
}
+ return new_buf;
}
-/* Convert a stringbuf containing 8-bit Latin-1-encoded characters to
- one containing 32-bit UCS-4-encoded characters. */
-static void
-widen_stringbuf (SCM buf)
+/* Return a Latin-1-encoded stringbuf containing the (possibly UCS-4-encoded)
+ characters from BUF, if possible. */
+static SCM
+narrow_stringbuf (SCM buf)
{
- size_t i, len;
- scm_t_wchar *mem;
-
- if (STRINGBUF_WIDE (buf))
- return;
+ SCM new_buf;
- if (STRINGBUF_INLINE (buf))
+ if (!STRINGBUF_WIDE (buf))
+ new_buf = buf;
+ else
{
- len = STRINGBUF_INLINE_LENGTH (buf);
+ size_t i, len;
+ scm_t_wchar *wmem;
+ unsigned char *mem;
+
+ len = STRINGBUF_LENGTH (buf);
+ wmem = STRINGBUF_WIDE_CHARS (buf);
- mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
for (i = 0; i < len; i++)
- mem[i] =
- (scm_t_wchar) (unsigned char) STRINGBUF_INLINE_CHARS (buf)[i];
- mem[len] = 0;
+ if (wmem[i] > 0xFF)
+ /* BUF cannot be narrowed. */
+ return buf;
- SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) ^ STRINGBUF_F_INLINE);
- SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE);
- SCM_SET_CELL_WORD_1 (buf, mem);
- SCM_SET_CELL_WORD_2 (buf, len);
- }
- else
- {
- len = STRINGBUF_OUTLINE_LENGTH (buf);
+ new_buf = make_stringbuf (len);
- mem = scm_gc_malloc (sizeof (scm_t_wchar) * (len + 1), "string");
+ mem = STRINGBUF_CHARS (new_buf);
for (i = 0; i < len; i++)
- mem[i] =
- (scm_t_wchar) (unsigned char) STRINGBUF_OUTLINE_CHARS (buf)[i];
+ mem[i] = (unsigned char) wmem[i];
mem[len] = 0;
-
- scm_gc_free (STRINGBUF_OUTLINE_CHARS (buf), len + 1, "string");
-
- SCM_SET_CELL_WORD_0 (buf, SCM_CELL_WORD_0 (buf) | STRINGBUF_F_WIDE);
- SCM_SET_CELL_WORD_1 (buf, mem);
- SCM_SET_CELL_WORD_2 (buf, len);
}
+
+ return new_buf;
}
scm_i_pthread_mutex_t stringbuf_write_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
+\f
/* Copy-on-write strings.
*/
/* Read-only strings.
*/
-#define RO_STRING_TAG (scm_tc7_string + 0x200)
+#define RO_STRING_TAG scm_tc7_ro_string
#define IS_RO_STRING(str) (SCM_CELL_TYPE(str)==RO_STRING_TAG)
/* Mutation-sharing substrings
#define IS_SH_STRING(str) (SCM_CELL_TYPE(str)==SH_STRING_TAG)
+SCM scm_nullstr;
+
/* Create a scheme string with space for LEN 8-bit Latin-1-encoded
characters. CHARSP, if not NULL, will be set to location of the
char array. */
SCM buf = make_stringbuf (len);
SCM res;
if (charsp)
- *charsp = STRINGBUF_CHARS (buf);
+ *charsp = (char *) STRINGBUF_CHARS (buf);
res = scm_double_cell (STRING_TAG, SCM_UNPACK(buf),
(scm_t_bits)0, (scm_t_bits) len);
return res;
return scm_i_substring_shared (str, start, end);
}
-SCM
-scm_i_string_mark (SCM str)
-{
- if (IS_SH_STRING (str))
- return SH_STRING_STRING (str);
- else
- return STRING_STRINGBUF (str);
-}
-
-void
-scm_i_string_free (SCM str)
-{
-}
-
+\f
/* Internal accessors
*/
return !STRINGBUF_WIDE (STRING_STRINGBUF (str));
}
+/* Try to coerce a string to be narrow. It if is narrow already, do
+ nothing. If it is wide, shrink it to narrow if none of its
+ characters are above 0xFF. Return true if the string is narrow or
+ was made to be narrow. */
+int
+scm_i_try_narrow_string (SCM str)
+{
+ SET_STRING_STRINGBUF (str, narrow_stringbuf (STRING_STRINGBUF (str)));
+
+ return scm_i_is_narrow_string (str);
+}
+
+/* Return a pointer to the raw data of the string, which can be either Latin-1
+ or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
+const void *
+scm_i_string_data (SCM str)
+{
+ SCM buf;
+ size_t start;
+ const char *data;
+
+ get_str_buf_start (&str, &buf, &start);
+
+ data = STRINGBUF_CONTENTS (buf);
+ data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
+
+ return data;
+}
+
/* Returns a pointer to the 8-bit Latin-1 encoded character array of
STR. */
const char *
size_t start;
get_str_buf_start (&str, &buf, &start);
if (scm_i_is_narrow_string (str))
- return STRINGBUF_CHARS (buf) + start;
+ return (const char *) STRINGBUF_CHARS (buf) + start;
else
scm_misc_error (NULL, "Invalid read access of chars of wide string: ~s",
scm_list_1 (str));
get_str_buf_start (&str, &buf, &start);
if (!scm_i_is_narrow_string (str))
- return STRINGBUF_WIDE_CHARS (buf) + start;
+ return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf) + start;
else
scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
scm_list_1 (str));
(scm_t_uint32 *) (STRINGBUF_WIDE_CHARS (buf)
+ STRING_START (str)), len);
}
- scm_i_thread_put_to_sleep ();
+
SET_STRING_STRINGBUF (str, new_buf);
start -= STRING_START (str);
+
+ /* FIXME: The following operations are not atomic, so other threads
+ looking at STR may see an inconsistent state. Nevertheless it can't
+ hurt much since (i) accessing STR while it is being mutated can't
+ yield a crash, and (ii) concurrent accesses to STR should be
+ protected by a mutex at the application level. The latter may not
+ apply when STR != ORIG_STR, though. */
SET_STRING_START (str, 0);
- scm_i_thread_wake_up ();
+ SET_STRING_STRINGBUF (str, new_buf);
buf = new_buf;
get_str_buf_start (&str, &buf, &start);
if (scm_i_is_narrow_string (str))
- return STRINGBUF_CHARS (buf) + start;
+ return (char *) STRINGBUF_CHARS (buf) + start;
else
scm_misc_error (NULL, "Invalid write access of chars of wide string: ~s",
scm_list_1 (str));
if (!scm_i_is_narrow_string (str))
return STRINGBUF_WIDE_CHARS (buf) + start;
else
- scm_misc_error (NULL, "Invalid read access of chars of narrow string: ~s",
+ scm_misc_error (NULL, "Invalid write access of chars of narrow string: ~s",
scm_list_1 (str));
}
return scm_i_string_wide_chars (str)[x];
}
+/* Returns index+1 of the first char in STR that matches C, or
+ 0 if the char is not found. */
+int
+scm_i_string_contains_char (SCM str, char ch)
+{
+ size_t i;
+ size_t len = scm_i_string_length (str);
+
+ i = 0;
+ if (scm_i_is_narrow_string (str))
+ {
+ while (i < len)
+ {
+ if (scm_i_string_chars (str)[i] == ch)
+ return i+1;
+ i++;
+ }
+ }
+ else
+ {
+ while (i < len)
+ {
+ if (scm_i_string_wide_chars (str)[i]
+ == (unsigned char) ch)
+ return i+1;
+ i++;
+ }
+ }
+ return 0;
+}
+
+int
+scm_i_string_strcmp (SCM sstr, size_t start_x, const char *cstr)
+{
+ if (scm_i_is_narrow_string (sstr))
+ {
+ const char *a = scm_i_string_chars (sstr) + start_x;
+ const char *b = cstr;
+ return strncmp (a, b, strlen(b));
+ }
+ else
+ {
+ size_t i;
+ const scm_t_wchar *a = scm_i_string_wide_chars (sstr) + start_x;
+ const char *b = cstr;
+ for (i = 0; i < strlen (b); i++)
+ {
+ if (a[i] != (unsigned char) b[i])
+ return 1;
+ }
+ }
+ return 0;
+}
+
/* Set the Pth character of STR to UCS-4 codepoint CHR. */
void
scm_i_string_set_x (SCM str, size_t p, scm_t_wchar chr)
{
if (chr > 0xFF && scm_i_is_narrow_string (str))
- widen_stringbuf (STRING_STRINGBUF (str));
+ SET_STRING_STRINGBUF (str, wide_stringbuf (STRING_STRINGBUF (str)));
if (scm_i_is_narrow_string (str))
{
char *dst = scm_i_string_writable_chars (str);
- dst[p] = (char) (unsigned char) chr;
+ dst[p] = chr;
}
else
{
}
}
+\f
/* Symbols.
-
+
Basic symbol creation and accessing is done here, the rest is in
symbols.[hc]. This has been done to keep stringbufs and the
internals of strings and string-like objects confined to this file.
SCM buf = make_stringbuf (len);
memcpy (STRINGBUF_CHARS (buf), name, len);
- return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
- (scm_t_bits) hash, SCM_UNPACK (props));
-}
-
-/* Return a new symbol that uses the LEN bytes pointed to by NAME as its
- underlying storage. */
-SCM
-scm_i_c_take_symbol (char *name, size_t len,
- scm_t_bits flags, unsigned long hash, SCM props)
-{
- SCM buf = scm_i_take_stringbufn (name, len);
-
- return scm_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
- (scm_t_bits) hash, SCM_UNPACK (props));
+ return scm_immutable_double_cell (scm_tc7_symbol | flags, SCM_UNPACK (buf),
+ (scm_t_bits) hash, SCM_UNPACK (props));
}
/* Returns the number of characters in SYM. This may be different
buf = SYMBOL_STRINGBUF (sym);
if (!STRINGBUF_WIDE (buf))
- return STRINGBUF_CHARS (buf);
+ return (const char *) STRINGBUF_CHARS (buf);
else
scm_misc_error (NULL, "Invalid access of chars of a wide symbol ~S",
scm_list_1 (sym));
buf = SYMBOL_STRINGBUF (sym);
if (STRINGBUF_WIDE (buf))
- return STRINGBUF_WIDE_CHARS (buf);
+ return (const scm_t_wchar *) STRINGBUF_WIDE_CHARS (buf);
else
scm_misc_error (NULL, "Invalid access of chars of a narrow symbol ~S",
scm_list_1 (sym));
}
-SCM
-scm_i_symbol_mark (SCM sym)
-{
- scm_gc_mark (SYMBOL_STRINGBUF (sym));
- return SCM_CELL_OBJECT_3 (sym);
-}
-
-void
-scm_i_symbol_free (SCM sym)
-{
-}
-
SCM
scm_i_symbol_substring (SCM sym, size_t start, size_t end)
{
"The number of characters in this stringbuf\n"
"@item stringbuf-shared\n"
"@code{#t} if this stringbuf is shared\n"
- "@item stringbuf-inline\n"
- "@code{#t} if this stringbuf's characters are stored in the\n"
- "cell itself, or @code{#f} if they were allocated in memory\n"
"@item stringbuf-wide\n"
"@code{#t} if this stringbuf's characters are stored in a\n"
"32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
"@end table")
#define FUNC_NAME s_scm_sys_string_dump
{
- SCM e1, e2, e3, e4, e5, e6, e7, e8, e9, e10;
+ SCM e1, e2, e3, e4, e5, e6, e7, e8, e9;
SCM buf;
SCM_VALIDATE_STRING (1, str);
/* String info */
- e1 = scm_cons (scm_from_locale_symbol ("string"),
+ e1 = scm_cons (scm_from_latin1_symbol ("string"),
str);
- e2 = scm_cons (scm_from_locale_symbol ("start"),
+ e2 = scm_cons (scm_from_latin1_symbol ("start"),
scm_from_size_t (STRING_START (str)));
- e3 = scm_cons (scm_from_locale_symbol ("length"),
+ e3 = scm_cons (scm_from_latin1_symbol ("length"),
scm_from_size_t (STRING_LENGTH (str)));
if (IS_SH_STRING (str))
{
- e4 = scm_cons (scm_from_locale_symbol ("shared"),
+ e4 = scm_cons (scm_from_latin1_symbol ("shared"),
SH_STRING_STRING (str));
buf = STRING_STRINGBUF (SH_STRING_STRING (str));
}
else
{
- e4 = scm_cons (scm_from_locale_symbol ("shared"),
+ e4 = scm_cons (scm_from_latin1_symbol ("shared"),
SCM_BOOL_F);
buf = STRING_STRINGBUF (str);
}
if (IS_RO_STRING (str))
- e5 = scm_cons (scm_from_locale_symbol ("read-only"),
+ e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
SCM_BOOL_T);
else
- e5 = scm_cons (scm_from_locale_symbol ("read-only"),
+ e5 = scm_cons (scm_from_latin1_symbol ("read-only"),
SCM_BOOL_F);
-
+
/* Stringbuf info */
if (!STRINGBUF_WIDE (buf))
{
char *cbuf;
SCM sbc = scm_i_make_string (len, &cbuf);
memcpy (cbuf, STRINGBUF_CHARS (buf), len);
- e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
+ e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
sbc);
}
else
SCM sbc = scm_i_make_wide_string (len, &cbuf);
u32_cpy ((scm_t_uint32 *) cbuf,
(scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
- e6 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
+ e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
sbc);
}
- e7 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
+ e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
scm_from_size_t (STRINGBUF_LENGTH (buf)));
if (STRINGBUF_SHARED (buf))
- e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
+ e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
SCM_BOOL_T);
else
- e8 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
- SCM_BOOL_F);
- if (STRINGBUF_INLINE (buf))
- e9 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
- SCM_BOOL_T);
- else
- e9 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
+ e8 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
SCM_BOOL_F);
if (STRINGBUF_WIDE (buf))
- e10 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
- SCM_BOOL_T);
+ e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
+ SCM_BOOL_T);
else
- e10 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
- SCM_BOOL_F);
+ e9 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
+ SCM_BOOL_F);
- return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, e10, SCM_UNDEFINED);
+ return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, e9, SCM_UNDEFINED);
}
#undef FUNC_NAME
"The number of characters in this stringbuf\n"
"@item stringbuf-shared\n"
"@code{#t} if this stringbuf is shared\n"
- "@item stringbuf-inline\n"
- "@code{#t} if this stringbuf's characters are stored in the\n"
- "cell itself, or @code{#f} if they were allocated in memory\n"
"@item stringbuf-wide\n"
"@code{#t} if this stringbuf's characters are stored in a\n"
"32-bit buffer, or @code{#f} if they are stored in an 8-bit\n"
"@end table")
#define FUNC_NAME s_scm_sys_symbol_dump
{
- SCM e1, e2, e3, e4, e5, e6, e7, e8;
+ SCM e1, e2, e3, e4, e5, e6, e7;
SCM buf;
SCM_VALIDATE_SYMBOL (1, sym);
- e1 = scm_cons (scm_from_locale_symbol ("symbol"),
+ e1 = scm_cons (scm_from_latin1_symbol ("symbol"),
sym);
- e2 = scm_cons (scm_from_locale_symbol ("hash"),
+ e2 = scm_cons (scm_from_latin1_symbol ("hash"),
scm_from_ulong (scm_i_symbol_hash (sym)));
- e3 = scm_cons (scm_from_locale_symbol ("interned"),
+ e3 = scm_cons (scm_from_latin1_symbol ("interned"),
scm_symbol_interned_p (sym));
buf = SYMBOL_STRINGBUF (sym);
char *cbuf;
SCM sbc = scm_i_make_string (len, &cbuf);
memcpy (cbuf, STRINGBUF_CHARS (buf), len);
- e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
+ e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
sbc);
}
else
SCM sbc = scm_i_make_wide_string (len, &cbuf);
u32_cpy ((scm_t_uint32 *) cbuf,
(scm_t_uint32 *) STRINGBUF_WIDE_CHARS (buf), len);
- e4 = scm_cons (scm_from_locale_symbol ("stringbuf-chars"),
+ e4 = scm_cons (scm_from_latin1_symbol ("stringbuf-chars"),
sbc);
}
- e5 = scm_cons (scm_from_locale_symbol ("stringbuf-length"),
+ e5 = scm_cons (scm_from_latin1_symbol ("stringbuf-length"),
scm_from_size_t (STRINGBUF_LENGTH (buf)));
if (STRINGBUF_SHARED (buf))
- e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
+ e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
SCM_BOOL_T);
else
- e6 = scm_cons (scm_from_locale_symbol ("stringbuf-shared"),
- SCM_BOOL_F);
- if (STRINGBUF_INLINE (buf))
- e7 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
- SCM_BOOL_T);
- else
- e7 = scm_cons (scm_from_locale_symbol ("stringbuf-inline"),
+ e6 = scm_cons (scm_from_latin1_symbol ("stringbuf-shared"),
SCM_BOOL_F);
if (STRINGBUF_WIDE (buf))
- e8 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
+ e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
SCM_BOOL_T);
else
- e8 = scm_cons (scm_from_locale_symbol ("stringbuf-wide"),
+ e7 = scm_cons (scm_from_latin1_symbol ("stringbuf-wide"),
SCM_BOOL_F);
- return scm_list_n (e1, e2, e3, e4, e5, e6, e7, e8, SCM_UNDEFINED);
+ return scm_list_n (e1, e2, e3, e4, e5, e6, e7, SCM_UNDEFINED);
}
#undef FUNC_NAME
-#if SCM_STRING_LENGTH_HISTOGRAM
+#ifdef SCM_STRING_LENGTH_HISTOGRAM
SCM_DEFINE (scm_sys_stringbuf_hist, "%stringbuf-hist", 0, 0, 0, (void), "")
#define FUNC_NAME s_scm_sys_stringbuf_hist
"@var{chrs}.")
#define FUNC_NAME s_scm_string
{
- SCM result;
+ SCM result = SCM_BOOL_F;
SCM rest;
size_t len;
size_t p = 0;
long i;
+ int wide = 0;
/* Verify that this is a list of chars. */
i = scm_ilength (chrs);
+ SCM_ASSERT (i >= 0, chrs, SCM_ARG1, FUNC_NAME);
+
len = (size_t) i;
rest = chrs;
- SCM_ASSERT (len >= 0, chrs, SCM_ARG1, FUNC_NAME);
while (len > 0 && scm_is_pair (rest))
{
SCM elt = SCM_CAR (rest);
SCM_VALIDATE_CHAR (SCM_ARGn, elt);
+ if (SCM_CHAR (elt) > 0xFF)
+ wide = 1;
rest = SCM_CDR (rest);
len--;
scm_remember_upto_here_1 (elt);
len = (size_t) i;
rest = chrs;
- result = scm_i_make_string (len, NULL);
- result = scm_i_string_start_writing (result);
- while (len > 0 && scm_is_pair (rest))
+ if (wide == 0)
{
- SCM elt = SCM_CAR (rest);
- scm_i_string_set_x (result, p, SCM_CHAR (elt));
- p++;
- rest = SCM_CDR (rest);
- len--;
- scm_remember_upto_here_1 (elt);
+ char *buf;
+
+ result = scm_i_make_string (len, NULL);
+ result = scm_i_string_start_writing (result);
+ buf = scm_i_string_writable_chars (result);
+ while (len > 0 && scm_is_pair (rest))
+ {
+ SCM elt = SCM_CAR (rest);
+ buf[p] = (unsigned char) SCM_CHAR (elt);
+ p++;
+ rest = SCM_CDR (rest);
+ len--;
+ scm_remember_upto_here_1 (elt);
+ }
+ }
+ else
+ {
+ scm_t_wchar *buf;
+
+ result = scm_i_make_wide_string (len, NULL);
+ result = scm_i_string_start_writing (result);
+ buf = scm_i_string_writable_wide_chars (result);
+ while (len > 0 && scm_is_pair (rest))
+ {
+ SCM elt = SCM_CAR (rest);
+ buf[p] = SCM_CHAR (elt);
+ p++;
+ rest = SCM_CDR (rest);
+ len--;
+ scm_remember_upto_here_1 (elt);
+ }
}
scm_i_string_stop_writing ();
"Return a newly allocated string of\n"
"length @var{k}. If @var{chr} is given, then all elements of\n"
"the string are initialized to @var{chr}, otherwise the contents\n"
- "of the @var{string} are unspecified.")
+ "of the @var{string} are all set to @var{#\nul}.")
#define FUNC_NAME s_scm_make_string
{
return scm_c_make_string (scm_to_size_t (k), chr);
#define FUNC_NAME NULL
{
size_t p;
- SCM res = scm_i_make_string (len, NULL);
+ char *contents = NULL;
+ SCM res = scm_i_make_string (len, &contents);
- if (!SCM_UNBNDP (chr))
+ /* If no char is given, initialize string contents to NULL. */
+ if (SCM_UNBNDP (chr))
+ memset (contents, 0, len);
+ else
{
SCM_VALIDATE_CHAR (0, chr);
res = scm_i_string_start_writing (res);
}
#undef FUNC_NAME
-SCM_DEFINE (scm_string_width, "string-width", 1, 0, 0,
+SCM_DEFINE (scm_string_bytes_per_char, "string-bytes-per-char", 1, 0, 0,
(SCM string),
"Return the bytes used to represent a character in @var{string}."
"This will return 1 or 4.")
-#define FUNC_NAME s_scm_string_width
+#define FUNC_NAME s_scm_string_bytes_per_char
{
SCM_VALIDATE_STRING (1, string);
if (!scm_i_is_narrow_string (string))
}
#undef FUNC_NAME
-int
-scm_is_string (SCM obj)
+
+\f
+/* Charset conversion error handling. */
+
+SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
+SCM_SYMBOL (scm_decoding_error_key, "decoding-error");
+
+/* Raise an exception informing that character CHR could not be written
+ to PORT in its current encoding. */
+void
+scm_encoding_error (const char *subr, int err, const char *message,
+ SCM port, SCM chr)
{
- return IS_STRING (obj);
+ scm_throw (scm_encoding_error_key,
+ scm_list_n (scm_from_locale_string (subr),
+ scm_from_locale_string (message),
+ scm_from_int (err),
+ port, chr,
+ SCM_UNDEFINED));
}
+/* Raise an exception informing of an encoding error on PORT. This
+ means that a character could not be written in PORT's encoding. */
+void
+scm_decoding_error (const char *subr, int err, const char *message, SCM port)
+{
+ scm_throw (scm_decoding_error_key,
+ scm_list_n (scm_from_locale_string (subr),
+ scm_from_locale_string (message),
+ scm_from_int (err),
+ port,
+ SCM_UNDEFINED));
+}
+
+\f
+/* String conversion to/from C. */
+
SCM
-scm_from_locale_stringn (const char *str, size_t len)
+scm_from_stringn (const char *str, size_t len, const char *encoding,
+ scm_t_string_failed_conversion_handler handler)
{
+ size_t u32len, i;
+ scm_t_wchar *u32;
+ int wide = 0;
SCM res;
- char *dst;
+ /* The order of these checks is important. */
+ if (!str && len != 0)
+ scm_misc_error ("scm_from_stringn", "NULL string pointer", SCM_EOL);
if (len == (size_t) -1)
len = strlen (str);
if (len == 0)
return scm_nullstr;
- res = scm_i_make_string (len, &dst);
- memcpy (dst, str, len);
+ if (encoding == NULL)
+ {
+ /* If encoding is null, use Latin-1. */
+ char *buf;
+ res = scm_i_make_string (len, &buf);
+ memcpy (buf, str, len);
+ return res;
+ }
+
+ u32len = 0;
+ u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
+ (enum iconv_ilseq_handler)
+ handler,
+ str, len,
+ NULL,
+ NULL, &u32len);
+
+ if (SCM_UNLIKELY (u32 == NULL))
+ {
+ /* Raise an error and pass the raw C string as a bytevector to the `throw'
+ handler. */
+ SCM bv;
+ signed char *buf;
+
+ buf = scm_gc_malloc_pointerless (len, "bytevector");
+ memcpy (buf, str, len);
+ bv = scm_c_take_bytevector (buf, len);
+
+ scm_decoding_error (__func__, errno,
+ "input locale conversion error", bv);
+ }
+
+ i = 0;
+ while (i < u32len)
+ if (u32[i++] > 0xFF)
+ {
+ wide = 1;
+ break;
+ }
+
+ if (!wide)
+ {
+ char *dst;
+ res = scm_i_make_string (u32len, &dst);
+ for (i = 0; i < u32len; i ++)
+ dst[i] = (unsigned char) u32[i];
+ dst[u32len] = '\0';
+ }
+ else
+ {
+ scm_t_wchar *wdst;
+ res = scm_i_make_wide_string (u32len, &wdst);
+ u32_cpy ((scm_t_uint32 *) wdst, (scm_t_uint32 *) u32, u32len);
+ wdst[u32len] = 0;
+ }
+
+ free (u32);
return res;
}
SCM
scm_from_locale_string (const char *str)
{
- if (str == NULL)
- return scm_nullstr;
-
return scm_from_locale_stringn (str, -1);
}
-/* Create a new scheme string from the C string STR. The memory of
- STR may be used directly as storage for the new string. */
SCM
-scm_take_locale_stringn (char *str, size_t len)
+scm_from_locale_stringn (const char *str, size_t len)
{
- SCM buf, res;
+ const char *enc;
+ scm_t_string_failed_conversion_handler hndl;
+ SCM inport;
+ scm_t_port *pt;
- if (len == (size_t) -1)
- len = strlen (str);
+ inport = scm_current_input_port ();
+ if (!SCM_UNBNDP (inport) && SCM_OPINPORTP (inport))
+ {
+ pt = SCM_PTAB_ENTRY (inport);
+ enc = pt->encoding;
+ hndl = pt->ilseq_handler;
+ }
else
{
- /* Ensure STR is null terminated. A realloc for 1 extra byte should
- often be satisfied from the alignment padding after the block, with
- no actual data movement. */
- str = scm_realloc (str, len + 1);
- str[len] = '\0';
+ enc = NULL;
+ hndl = SCM_FAILED_CONVERSION_ERROR;
}
- buf = scm_i_take_stringbufn (str, len);
- res = scm_double_cell (STRING_TAG,
- SCM_UNPACK (buf), (scm_t_bits) 0, (scm_t_bits) len);
+ return scm_from_stringn (str, len, enc, hndl);
+}
+
+SCM
+scm_from_latin1_string (const char *str)
+{
+ return scm_from_latin1_stringn (str, -1);
+}
+
+SCM
+scm_from_latin1_stringn (const char *str, size_t len)
+{
+ char *buf;
+ SCM result;
+
+ if (len == (size_t) -1)
+ len = strlen (str);
+
+ /* Make a narrow string and copy STR as is. */
+ result = scm_i_make_string (len, &buf);
+ memcpy (buf, str, len);
+
+ return result;
+}
+
+SCM
+scm_from_utf8_string (const char *str)
+{
+ return scm_from_utf8_stringn (str, -1);
+}
+
+SCM
+scm_from_utf8_stringn (const char *str, size_t len)
+{
+ return scm_from_stringn (str, len, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
+}
+
+SCM
+scm_from_utf32_string (const scm_t_wchar *str)
+{
+ return scm_from_utf32_stringn (str, -1);
+}
+
+SCM
+scm_from_utf32_stringn (const scm_t_wchar *str, size_t len)
+{
+ SCM result;
+ scm_t_wchar *buf;
+
+ if (len == (size_t) -1)
+ len = u32_strlen ((uint32_t *) str);
+
+ result = scm_i_make_wide_string (len, &buf);
+ memcpy (buf, str, len * sizeof (scm_t_wchar));
+ scm_i_try_narrow_string (result);
+
+ return result;
+}
+
+/* Create a new scheme string from the C string STR. The memory of
+ STR may be used directly as storage for the new string. */
+/* FIXME: GC-wise, the only way to use the memory area pointed to by STR
+ would be to register a finalizer to eventually free(3) STR, which isn't
+ worth it. Should we just deprecate the `scm_take_' functions? */
+SCM
+scm_take_locale_stringn (char *str, size_t len)
+{
+ SCM res;
+
+ res = scm_from_locale_stringn (str, len);
+ free (str);
+
return res;
}
return scm_take_locale_stringn (str, -1);
}
-/* Change libunistring escapes (\uXXXX and \UXXXXXXXX) to \xXX \uXXXX
- and \UXXXXXX. */
+/* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
+ *LENP-byte locale-encoded string, to `\xXX', `\uXXXX', or `\UXXXXXX'.
+ Set *LENP to the size of the resulting string.
+
+ FIXME: This is a hack we should get rid of. See
+ <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00004.html>
+ for details. */
static void
-unistring_escapes_to_guile_escapes (char **bufp, size_t *lenp)
+unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
{
char *before, *after;
size_t i, j;
- before = *bufp;
- after = *bufp;
+ before = buf;
+ after = buf;
i = 0;
j = 0;
while (i < *lenp)
}
}
*lenp = j;
- after = scm_realloc (after, j);
+}
+
+/* Change libunistring escapes (`\uXXXX' and `\UXXXXXXXX') in BUF, a
+ *LENP-byte locale-encoded string, to `\xXXXX;'. Set *LEN to the size
+ of the resulting string. BUF must be large enough to handle the
+ worst case when `\uXXXX' escapes (6 characters) are replaced by
+ `\xXXXX;' (7 characters). */
+static void
+unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
+{
+ char *before, *after;
+ size_t i, j;
+ /* The worst case is if the input string contains all 4-digit hex escapes.
+ "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
+ size_t max_out_len = (*lenp * 7) / 6 + 1;
+ size_t nzeros, ndigits;
+
+ before = buf;
+ after = alloca (max_out_len);
+ i = 0;
+ j = 0;
+ while (i < *lenp)
+ {
+ if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
+ || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
+ {
+ if (before[i + 1] == 'u')
+ ndigits = 4;
+ else if (before[i + 1] == 'U')
+ ndigits = 8;
+ else
+ abort ();
+
+ /* Add the R6RS hex escape initial sequence. */
+ after[j] = '\\';
+ after[j + 1] = 'x';
+
+ /* Move string positions to the start of the hex numbers. */
+ i += 2;
+ j += 2;
+
+ /* Find the number of initial zeros in this hex number. */
+ nzeros = 0;
+ while (before[i + nzeros] == '0' && nzeros < ndigits)
+ nzeros++;
+
+ /* Copy the number, skipping initial zeros, and then move the string
+ positions. */
+ if (nzeros == ndigits)
+ {
+ after[j] = '0';
+ i += ndigits;
+ j += 1;
+ }
+ else
+ {
+ int pos;
+ for (pos = 0; pos < ndigits - nzeros; pos++)
+ after[j + pos] = tolower ((int) before[i + nzeros + pos]);
+ i += ndigits;
+ j += (ndigits - nzeros);
+ }
+
+ /* Add terminating semicolon. */
+ after[j] = ';';
+ j++;
+ }
+ else
+ {
+ after[j] = before[i];
+ i++;
+ j++;
+ }
+ }
+ *lenp = j;
+ memcpy (before, after, j);
}
char *
-scm_to_locale_stringn (SCM str, size_t * lenp)
+scm_to_locale_string (SCM str)
{
+ return scm_to_locale_stringn (str, NULL);
+}
+
+char *
+scm_to_locale_stringn (SCM str, size_t *lenp)
+{
+ SCM outport;
+ scm_t_port *pt;
const char *enc;
- /* In the future, enc will hold the port's encoding. */
- enc = NULL;
+ outport = scm_current_output_port ();
+ if (!SCM_UNBNDP (outport) && SCM_OPOUTPORTP (outport))
+ {
+ pt = SCM_PTAB_ENTRY (outport);
+ enc = pt->encoding;
+ }
+ else
+ enc = NULL;
- return scm_to_stringn (str, lenp, enc, iconveh_escape_sequence);
+ return scm_to_stringn (str, lenp,
+ enc,
+ scm_i_get_conversion_strategy (SCM_BOOL_F));
}
-/* Low-level scheme to C string conversion function. */
char *
-scm_to_stringn (SCM str, size_t * lenp, const char *encoding,
- enum iconv_ilseq_handler handler)
+scm_to_latin1_string (SCM str)
+{
+ return scm_to_latin1_stringn (str, NULL);
+}
+
+char *
+scm_to_latin1_stringn (SCM str, size_t *lenp)
+#define FUNC_NAME "scm_to_latin1_stringn"
+{
+ char *result;
+
+ SCM_VALIDATE_STRING (1, str);
+
+ if (scm_i_is_narrow_string (str))
+ {
+ if (lenp)
+ *lenp = scm_i_string_length (str);
+
+ result = scm_strdup (scm_i_string_data (str));
+ }
+ else
+ result = scm_to_stringn (str, lenp, NULL,
+ SCM_FAILED_CONVERSION_ERROR);
+
+ return result;
+}
+#undef FUNC_NAME
+
+char *
+scm_to_utf8_string (SCM str)
+{
+ return scm_to_utf8_stringn (str, NULL);
+}
+
+char *
+scm_to_utf8_stringn (SCM str, size_t *lenp)
+{
+ return scm_to_stringn (str, lenp, "UTF-8", SCM_FAILED_CONVERSION_ERROR);
+}
+
+scm_t_wchar *
+scm_to_utf32_string (SCM str)
+{
+ return scm_to_utf32_stringn (str, NULL);
+}
+
+scm_t_wchar *
+scm_to_utf32_stringn (SCM str, size_t *lenp)
+#define FUNC_NAME "scm_to_utf32_stringn"
+{
+ scm_t_wchar *result;
+
+ SCM_VALIDATE_STRING (1, str);
+
+ if (scm_i_is_narrow_string (str))
+ result = (scm_t_wchar *)
+ scm_to_stringn (str, lenp, "UTF-32",
+ SCM_FAILED_CONVERSION_ERROR);
+ else
+ {
+ size_t len;
+
+ len = scm_i_string_length (str);
+ if (lenp)
+ *lenp = len;
+
+ result = scm_malloc ((len + 1) * sizeof (scm_t_wchar));
+ memcpy (result, scm_i_string_wide_chars (str),
+ len * sizeof (scm_t_wchar));
+ result[len] = 0;
+ }
+
+ return result;
+}
+#undef FUNC_NAME
+
+/* Return a malloc(3)-allocated buffer containing the contents of STR encoded
+ according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
+ the returned buffer. If the conversion to ENCODING fails, apply the strategy
+ defined by HANDLER. */
+char *
+scm_to_stringn (SCM str, size_t *lenp, const char *encoding,
+ scm_t_string_failed_conversion_handler handler)
{
- static const char iso[11] = "ISO-8859-1";
char *buf;
size_t ilen, len, i;
+ int ret;
+ const char *enc;
if (!scm_is_string (str))
scm_wrong_type_arg_msg (NULL, 0, str, "string");
*lenp = 0;
return buf;
}
-
+
if (lenp == NULL)
for (i = 0; i < ilen; i++)
if (scm_i_string_ref (str, i) == '\0')
"string contains #\\nul character: ~S",
scm_list_1 (str));
- if (scm_i_is_narrow_string (str))
+ if (scm_i_is_narrow_string (str) && (encoding == NULL))
{
+ /* If using native Latin-1 encoding, just copy the string
+ contents. */
if (lenp)
{
buf = scm_malloc (ilen);
}
}
-
+
buf = NULL;
len = 0;
- buf = u32_conv_to_encoding (iso,
- handler,
- (scm_t_uint32 *) scm_i_string_wide_chars (str),
- ilen, NULL, NULL, &len);
- if (buf == NULL)
- scm_misc_error (NULL, "cannot convert to output locale ~s: \"~s\"",
- scm_list_2 (scm_from_locale_string (iso), str));
-
- if (handler == iconveh_escape_sequence)
- unistring_escapes_to_guile_escapes (&buf, &len);
+ enc = encoding;
+ if (enc == NULL)
+ enc = "ISO-8859-1";
+ if (scm_i_is_narrow_string (str))
+ {
+ ret = mem_iconveh (scm_i_string_chars (str), ilen,
+ "ISO-8859-1", enc,
+ (enum iconv_ilseq_handler) handler, NULL,
+ &buf, &len);
+
+ if (ret != 0)
+ scm_encoding_error (__func__, errno,
+ "cannot convert narrow string to output locale",
+ SCM_BOOL_F,
+ /* FIXME: Faulty character unknown. */
+ SCM_BOOL_F);
+ }
+ else
+ {
+ buf = u32_conv_to_encoding (enc,
+ (enum iconv_ilseq_handler) handler,
+ (scm_t_uint32 *) scm_i_string_wide_chars (str),
+ ilen,
+ NULL,
+ NULL, &len);
+ if (buf == NULL)
+ scm_encoding_error (__func__, errno,
+ "cannot convert wide string to output locale",
+ SCM_BOOL_F,
+ /* FIXME: Faulty character unknown. */
+ SCM_BOOL_F);
+ }
+ if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
+ {
+ if (SCM_R6RS_ESCAPES_P)
+ {
+ /* The worst case is if the input string contains all 4-digit
+ hex escapes. "\uXXXX" (six characters) becomes "\xXXXX;"
+ (seven characters). Make BUF large enough to hold
+ that. */
+ buf = scm_realloc (buf, (len * 7) / 6 + 1);
+ unistring_escapes_to_r6rs_escapes (buf, &len);
+ }
+ else
+ unistring_escapes_to_guile_escapes (buf, &len);
+ buf = scm_realloc (buf, len);
+ }
if (lenp)
*lenp = len;
else
return buf;
}
-char *
-scm_to_locale_string (SCM str)
-{
- return scm_to_locale_stringn (str, NULL);
-}
-
size_t
scm_to_locale_stringbuf (SCM str, char *buf, size_t max_len)
{
return len;
}
+\f
+/* Unicode string normalization. */
+
+/* This function is a partial clone of SCM_STRING_TO_U32_BUF from
+ libguile/i18n.c. It would be useful to have this factored out into a more
+ convenient location, but its use of alloca makes that tricky to do. */
+
+static SCM
+normalize_str (SCM string, uninorm_t form)
+{
+ SCM ret;
+ scm_t_uint32 *w_str;
+ scm_t_wchar *cbuf;
+ size_t rlen, len = scm_i_string_length (string);
+
+ if (scm_i_is_narrow_string (string))
+ {
+ size_t i;
+ const char *buf = scm_i_string_chars (string);
+
+ w_str = alloca (sizeof (scm_t_wchar) * (len + 1));
+
+ for (i = 0; i < len; i ++)
+ w_str[i] = (unsigned char) buf[i];
+ w_str[len] = 0;
+ }
+ else
+ w_str = (scm_t_uint32 *) scm_i_string_wide_chars (string);
+
+ w_str = u32_normalize (form, w_str, len, NULL, &rlen);
+
+ ret = scm_i_make_wide_string (rlen, &cbuf);
+ u32_cpy ((scm_t_uint32 *) cbuf, w_str, rlen);
+ free (w_str);
+
+ scm_i_try_narrow_string (ret);
+
+ return ret;
+}
+
+SCM_DEFINE (scm_string_normalize_nfc, "string-normalize-nfc", 1, 0, 0,
+ (SCM string),
+ "Returns the NFC normalized form of @var{string}.")
+#define FUNC_NAME s_scm_string_normalize_nfc
+{
+ SCM_VALIDATE_STRING (1, string);
+ return normalize_str (string, UNINORM_NFC);
+}
+#undef FUNC_NAME
+
+SCM_DEFINE (scm_string_normalize_nfd, "string-normalize-nfd", 1, 0, 0,
+ (SCM string),
+ "Returns the NFD normalized form of @var{string}.")
+#define FUNC_NAME s_scm_string_normalize_nfd
+{
+ SCM_VALIDATE_STRING (1, string);
+ return normalize_str (string, UNINORM_NFD);
+}
+#undef FUNC_NAME
+
+SCM_DEFINE (scm_string_normalize_nfkc, "string-normalize-nfkc", 1, 0, 0,
+ (SCM string),
+ "Returns the NFKC normalized form of @var{string}.")
+#define FUNC_NAME s_scm_string_normalize_nfkc
+{
+ SCM_VALIDATE_STRING (1, string);
+ return normalize_str (string, UNINORM_NFKC);
+}
+#undef FUNC_NAME
+
+SCM_DEFINE (scm_string_normalize_nfkd, "string-normalize-nfkd", 1, 0, 0,
+ (SCM string),
+ "Returns the NFKD normalized form of @var{string}.")
+#define FUNC_NAME s_scm_string_normalize_nfkd
+{
+ SCM_VALIDATE_STRING (1, string);
+ return normalize_str (string, UNINORM_NFKD);
+}
+#undef FUNC_NAME
+
/* converts C scm_array of strings to SCM scm_list of strings. */
/* If argc < 0, a null terminated scm_array is assumed. */
SCM
char **
scm_i_allocate_string_pointers (SCM list)
+#define FUNC_NAME "scm_i_allocate_string_pointers"
{
char **result;
int len = scm_ilength (list);
if (len < 0)
scm_wrong_type_arg_msg (NULL, 0, list, "proper list");
- scm_dynwind_begin (0);
-
- result = (char **) scm_malloc ((len + 1) * sizeof (char *));
+ result = scm_gc_malloc ((len + 1) * sizeof (char *),
+ "string pointers");
result[len] = NULL;
- scm_dynwind_unwind_handler (free, result, 0);
/* The list might be have been modified in another thread, so
we check LIST before each access.
*/
for (i = 0; i < len && scm_is_pair (list); i++)
{
- result[i] = scm_to_locale_string (SCM_CAR (list));
+ SCM str;
+ size_t len;
+
+ str = SCM_CAR (list);
+ len = scm_c_string_length (str);
+
+ result[i] = scm_gc_malloc_pointerless (len + 1, "string pointers");
+ memcpy (result[i], scm_i_string_chars (str), len);
+ result[i][len] = '\0';
+
list = SCM_CDR (list);
}
- scm_dynwind_end ();
return result;
}
-
-void
-scm_i_free_string_pointers (char **pointers)
-{
- int i;
-
- for (i = 0; pointers[i]; i++)
- free (pointers[i]);
- free (pointers);
-}
+#undef FUNC_NAME
void
scm_i_get_substring_spec (size_t len,
null-terminated.
*/
if (IS_SH_STRING (str))
- scm_misc_error (NULL,
- "SCM_STRING_CHARS does not work with shared substrings.",
+ scm_misc_error (NULL,
+ "SCM_STRING_CHARS does not work with shared substrings",
SCM_EOL);
/* We explicitly test for read-only strings to produce a better
*/
if (IS_RO_STRING (str))
- scm_misc_error (NULL,
- "SCM_STRING_CHARS does not work with read-only strings.",
+ scm_misc_error (NULL,
+ "SCM_STRING_CHARS does not work with read-only strings",
SCM_EOL);
-
+
/* The following is still wrong, of course...
*/
str = scm_i_string_start_writing (str);
#endif
+static SCM
+string_handle_ref (scm_t_array_handle *h, size_t index)
+{
+ return scm_c_string_ref (h->array, index);
+}
+
+static void
+string_handle_set (scm_t_array_handle *h, size_t index, SCM val)
+{
+ scm_c_string_set_x (h->array, index, val);
+}
+
+static void
+string_get_handle (SCM v, scm_t_array_handle *h)
+{
+ h->array = v;
+ h->ndims = 1;
+ h->dims = &h->dim0;
+ h->dim0.lbnd = 0;
+ h->dim0.ubnd = scm_c_string_length (v) - 1;
+ h->dim0.inc = 1;
+ h->element_type = SCM_ARRAY_ELEMENT_TYPE_CHAR;
+ h->elements = h->writable_elements = NULL;
+}
+
+SCM_ARRAY_IMPLEMENTATION (scm_tc7_string, 0x7f,
+ string_handle_ref, string_handle_set,
+ string_get_handle)
+SCM_VECTOR_IMPLEMENTATION (SCM_ARRAY_ELEMENT_TYPE_CHAR, scm_make_string)
+
void
scm_init_strings ()
{