-/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc.
+/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
name = SH_STRING_STRING (name);
start += STRING_START (name);
}
- buf = SYMBOL_STRINGBUF (name);
+ buf = STRING_STRINGBUF (name);
if (start == 0 && length == STRINGBUF_LENGTH (buf))
{
#define FUNC_NAME s_scm_string_append
{
SCM res;
- size_t len = 0;
+ size_t total = 0;
+ size_t len;
int wide = 0;
SCM l, s;
size_t i;
{
s = SCM_CAR (l);
SCM_VALIDATE_STRING (SCM_ARGn, s);
- len += scm_i_string_length (s);
+ len = scm_i_string_length (s);
+ if (((size_t) -1) - total < len)
+ scm_num_overflow (s_scm_string_append);
+ total += len;
if (!scm_i_is_narrow_string (s))
wide = 1;
}
data.narrow = NULL;
if (!wide)
- res = scm_i_make_string (len, &data.narrow, 0);
+ res = scm_i_make_string (total, &data.narrow, 0);
else
- res = scm_i_make_wide_string (len, &data.wide, 0);
+ res = scm_i_make_wide_string (total, &data.wide, 0);
for (l = args; !scm_is_null (l); l = SCM_CDR (l))
{
s = SCM_CAR (l);
SCM_VALIDATE_STRING (SCM_ARGn, s);
len = scm_i_string_length (s);
+ if (len > total)
+ SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
if (!wide)
{
memcpy (data.narrow, scm_i_string_chars (s), len);
{
if (scm_i_is_narrow_string (s))
{
- for (i = 0; i < scm_i_string_length (s); i++)
- data.wide[i] = (unsigned char) scm_i_string_chars (s)[i];
+ const char *src = scm_i_string_chars (s);
+ for (i = 0; i < len; i++)
+ data.wide[i] = (unsigned char) src[i];
}
else
u32_cpy ((scm_t_uint32 *) data.wide,
(scm_t_uint32 *) scm_i_string_wide_chars (s), len);
data.wide += len;
}
+ total -= len;
scm_remember_upto_here_1 (s);
}
+ if (total != 0)
+ SCM_MISC_ERROR ("list changed during string-append", SCM_EOL);
return res;
}
#undef FUNC_NAME
if (len == (size_t) -1)
len = strlen (str);
- if (encoding == NULL || len == 0)
- {
- /* If encoding is null (or the string is empty), use Latin-1. */
- char *buf;
- res = scm_i_make_string (len, &buf, 0);
- memcpy (buf, str, len);
- return res;
- }
+ if (strcmp (encoding, "ISO-8859-1") == 0 || len == 0)
+ return scm_from_latin1_stringn (str, len);
+ else if (strcmp (encoding, "UTF-8") == 0
+ && handler == SCM_FAILED_CONVERSION_ERROR)
+ return scm_from_utf8_stringn (str, len);
u32len = 0;
u32 = (scm_t_wchar *) u32_conv_from_encoding (encoding,
scm_from_locale_stringn (const char *str, size_t len)
{
return scm_from_stringn (str, len, locale_charset (),
- scm_i_get_conversion_strategy (SCM_BOOL_F));
+ scm_i_default_port_conversion_handler ());
}
SCM
nbytes = u8_mbtouc (&c, ustr + i, len - i);
- if (nbytes < 0)
+ if (c == 0xfffd)
/* Bad UTF-8. */
decoding_error (__func__, errno, str, len);
res = scm_i_make_string (char_len, &dst, 0);
- for (i = 0, j = 0; i < len; i++, j++)
+ for (i = 0, j = 0; i < len; j++)
{
i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
dst[j] = (signed char) c;
res = scm_i_make_wide_string (char_len, &dst, 0);
- for (i = 0, j = 0; i < len; i++, j++)
+ for (i = 0, j = 0; i < len; j++)
{
i += u8_mbtouc_unsafe (&c, ustr + i, len - i);
dst[j] = c;
return result;
}
+/* Build a Scheme string from the C string STR, decoded the way PORT
+   prescribes.  This simply forwards to scm_from_port_stringn with a
+   length of (size_t) -1.
+   NOTE(review): the callees are expected to treat (size_t) -1 as
+   "measure STR with strlen" -- confirm against their definitions.  */
+SCM
+scm_from_port_string (const char *str, SCM port)
+{
+  const size_t unknown_length = (size_t) -1;
+
+  return scm_from_port_stringn (str, unknown_length, port);
+}
+
+/* Build a Scheme string from the LEN bytes at STR, choosing the
+   decoder from PORT's table entry:
+
+   - Latin-1 encoding mode: scm_from_latin1_stringn.
+   - UTF-8 encoding mode with the SCM_FAILED_CONVERSION_ERROR handler:
+     scm_from_utf8_stringn.
+   - Anything else: scm_from_stringn with the port's encoding name and
+     its ilseq handler.  */
+SCM
+scm_from_port_stringn (const char *str, size_t len, SCM port)
+{
+  scm_t_port *entry = SCM_PTAB_ENTRY (port);
+
+  /* Latin-1 is taken regardless of the conversion handler.  */
+  if (entry->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1)
+    return scm_from_latin1_stringn (str, len);
+
+  /* The dedicated UTF-8 decoder is only used when the port's handler
+     is SCM_FAILED_CONVERSION_ERROR.  */
+  if (entry->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8
+      && entry->ilseq_handler == SCM_FAILED_CONVERSION_ERROR)
+    return scm_from_utf8_stringn (str, len);
+
+  /* General case: go through the named-encoding converter.  */
+  return scm_from_stringn (str, len, entry->encoding, entry->ilseq_handler);
+}
+
/* Create a new scheme string from the C string STR. The memory of
STR may be used directly as storage for the new string. */
/* FIXME: GC-wise, the only way to use the memory area pointed to by STR
/* Convert the Scheme string STR to a C string by delegating to
   scm_to_stringn.  The target charset is whatever locale_charset ()
   returns, and the conversion handler is the one returned by
   scm_i_default_port_conversion_handler () (the removed line used
   scm_i_get_conversion_strategy (SCM_BOOL_F) instead).  STR and LENP
   are passed through to scm_to_stringn unchanged.  */
char *
scm_to_locale_stringn (SCM str, size_t *lenp)
{
- return scm_to_stringn (str, lenp,
+ return scm_to_stringn (str, lenp,
locale_charset (),
- scm_i_get_conversion_strategy (SCM_BOOL_F));
+ scm_i_default_port_conversion_handler ());
}
char *
return u8_result;
}
+/* UTF-8 code table
+
+ (Note that this includes code points that are not allowed by Unicode,
+ but since this function has no way to report an error, and its
+ purpose is to determine the size of destination buffers for
+ libunicode conversion functions, we err on the safe side and handle
+ everything that libunicode might conceivably handle, now or in the
+ future.)
+
+ Char. number range | UTF-8 octet sequence
+ (hexadecimal) | (binary)
+ --------------------+------------------------------------------------------
+ 0000 0000-0000 007F | 0xxxxxxx
+ 0000 0080-0000 07FF | 110xxxxx 10xxxxxx
+ 0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
+ 0001 0000-001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+ 0020 0000-03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ 0400 0000-7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+*/
+
+/* Return the total number of octets needed to encode the LEN code
+   points starting at STR as UTF-8, using the one- to six-octet ranges
+   listed in the code table above.  No validity check is made on the
+   code points; any value above 0x3ffffff is counted as six octets.  */
+static size_t
+u32_u8_length_in_bytes (const scm_t_uint32 *str, size_t len)
+{
+  size_t total = 0;
+  size_t k;
+
+  for (k = 0; k < len; k++)
+    {
+      const scm_t_uint32 cp = str[k];
+      size_t width;
+
+      /* Test the ranges from the widest encoding down to the
+         narrowest.  */
+      if (cp > 0x3ffffff)
+        width = 6;
+      else if (cp > 0x1fffff)
+        width = 5;
+      else if (cp > 0xffff)
+        width = 4;
+      else if (cp > 0x7ff)
+        width = 3;
+      else if (cp > 0x7f)
+        width = 2;
+      else
+        width = 1;
+
+      total += width;
+    }
+
+  return total;
+}
+
+/* Return a buffer holding the UTF-8 encoding of the Scheme string STR.
+   STR is first validated as a string.
+
+   Narrow strings are handed to latin1_to_u8 together with LENP.
+
+   For wide strings the output size is computed up front with
+   u32_u8_length_in_bytes, a buffer of that size is obtained from
+   scm_malloc, and u32_to_u8 fills it.  In that path, if LENP is
+   non-NULL the byte count is stored in *LENP; otherwise one extra byte
+   is allocated and set to zero right after the encoded data.
+
+   If u32_to_u8 does not return the supplied buffer filled to exactly
+   the predicted size, the buffer is freed and scm_decoding_error is
+   called with the errno value captured at that point.  */
char *
scm_to_utf8_stringn (SCM str, size_t *lenp)
+#define FUNC_NAME "scm_to_utf8_stringn"
{
+  SCM_VALIDATE_STRING (1, str);
+
  if (scm_i_is_narrow_string (str))
    return (char *) latin1_to_u8 ((scm_t_uint8 *) scm_i_string_chars (str),
                                  scm_i_string_length (str),
                                  NULL, lenp);
  else
-    return (char *) u32_to_u8 ((scm_t_uint32*)scm_i_string_wide_chars (str),
-                               scm_i_string_length (str),
-                               NULL, lenp);
+    {
+      scm_t_uint32 *chars = (scm_t_uint32 *) scm_i_string_wide_chars (str);
+      scm_t_uint8 *buf, *ret;
+      size_t num_chars = scm_i_string_length (str);
+      size_t num_bytes_predicted, num_bytes_actual;
+
+      num_bytes_predicted = u32_u8_length_in_bytes (chars, num_chars);
+
+      if (lenp)
+        {
+          /* Caller wants the length: no terminator is added.  */
+          *lenp = num_bytes_predicted;
+          buf = scm_malloc (num_bytes_predicted);
+        }
+      else
+        {
+          /* No length out-parameter: append a zero byte instead.  */
+          buf = scm_malloc (num_bytes_predicted + 1);
+          buf[num_bytes_predicted] = 0;
+        }
+
+      /* On input this is the capacity of BUF; u32_to_u8 overwrites it
+         with the number of bytes produced.  */
+      num_bytes_actual = num_bytes_predicted;
+      ret = u32_to_u8 (chars, num_chars, buf, &num_bytes_actual);
+
+      if (SCM_LIKELY (ret == buf && num_bytes_actual == num_bytes_predicted))
+        return (char *) ret;
+
+      /* An error: a bad codepoint.  */
+      {
+        int saved_errno = errno;
+
+        free (buf);
+        if (!saved_errno)
+          abort ();
+
+        /* Report the errno captured right after u32_to_u8 failed;
+           free () may have changed the global value since then.  */
+        scm_decoding_error ("scm_to_utf8_stringn", saved_errno,
+                            "invalid codepoint in string", str);
+
+        /* Not reached.  */
+        return NULL;
+      }
+    }
}
+#undef FUNC_NAME
scm_t_wchar *
scm_to_utf32_string (SCM str)
}
#undef FUNC_NAME
+/* Convert the Scheme string STR to a C string encoded the way PORT
+   prescribes.  Forwards to scm_to_port_stringn with a NULL length
+   pointer, i.e. the caller does not receive the byte count.  */
+char *
+scm_to_port_string (SCM str, SCM port)
+{
+  size_t *no_length_out = NULL;
+
+  return scm_to_port_stringn (str, no_length_out, port);
+}
+
+/* Convert the Scheme string STR to a C string, choosing the encoder
+   from PORT's table entry.  LENP is passed through to the chosen
+   encoder.
+
+   - Latin-1 encoding mode with the SCM_FAILED_CONVERSION_ERROR
+     handler: scm_to_latin1_stringn.
+   - UTF-8 encoding mode (any handler): scm_to_utf8_stringn.
+   - Anything else: scm_to_stringn with the port's encoding name and
+     its ilseq handler.  */
+char *
+scm_to_port_stringn (SCM str, size_t *lenp, SCM port)
+{
+  scm_t_port *entry = SCM_PTAB_ENTRY (port);
+
+  /* The dedicated Latin-1 encoder is only used when the port's handler
+     is SCM_FAILED_CONVERSION_ERROR.  */
+  if (entry->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1
+      && entry->ilseq_handler == SCM_FAILED_CONVERSION_ERROR)
+    return scm_to_latin1_stringn (str, lenp);
+
+  /* UTF-8 is taken regardless of the conversion handler.  */
+  if (entry->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8)
+    return scm_to_utf8_stringn (str, lenp);
+
+  /* General case: go through the named-encoding converter.  */
+  return scm_to_stringn (str, lenp, entry->encoding, entry->ilseq_handler);
+}
+
/* Return a malloc(3)-allocated buffer containing the contents of STR encoded
according to ENCODING. If LENP is non-NULL, set it to the size in bytes of
the returned buffer. If the conversion to ENCODING fails, apply the strategy
"string contains #\\nul character: ~S",
scm_list_1 (str));
- if (scm_i_is_narrow_string (str) && (encoding == NULL))
+ if (scm_i_is_narrow_string (str) && strcmp (encoding, "ISO-8859-1") == 0)
{
/* If using native Latin-1 encoding, just copy the string
contents. */