#include "libguile/generalized-vectors.h"
#include "libguile/deprecation.h"
#include "libguile/validate.h"
+#include "libguile/private-options.h"
\f
#define STRINGBUF_SHARED(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_SHARED)
#define STRINGBUF_WIDE(buf) (SCM_CELL_WORD_0(buf) & STRINGBUF_F_WIDE)
-#define STRINGBUF_CHARS(buf) ((unsigned char *) \
+#define STRINGBUF_CONTENTS(buf) ((void *) \
SCM_CELL_OBJECT_LOC (buf, \
STRINGBUF_HEADER_SIZE))
-#define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
+#define STRINGBUF_CHARS(buf) ((unsigned char *) STRINGBUF_CONTENTS (buf))
+#define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CONTENTS (buf))
-#define STRINGBUF_WIDE_CHARS(buf) ((scm_t_wchar *) STRINGBUF_CHARS (buf))
+#define STRINGBUF_LENGTH(buf) (SCM_CELL_WORD_1 (buf))
#define SET_STRINGBUF_SHARED(buf) \
do \
return scm_i_is_narrow_string (str);
}
+/* Return a pointer to the raw data of the string, which can be either Latin-1
+ or UCS-4 encoded data, depending on `scm_i_is_narrow_string (STR)'. */
+const void *
+scm_i_string_data (SCM str)
+{
+ SCM buf;
+ size_t start;
+ const char *data;
+
+ get_str_buf_start (&str, &buf, &start);
+
+ data = STRINGBUF_CONTENTS (buf);
+ data += start * (scm_i_is_narrow_string (str) ? 1 : 4);
+
+ return data;
+}
+
/* Returns a pointer to the 8-bit Latin-1 encoded character array of
STR. */
const char *
/* Conversion to/from other encodings. */
SCM_SYMBOL (scm_encoding_error_key, "encoding-error");
-static void
-scm_encoding_error (const char *subr, const char *message, SCM args)
-{
- scm_error (scm_encoding_error_key, subr, message, args, SCM_BOOL_F);
+void
+scm_encoding_error (const char *subr, int err, const char *message,
+ const char *from, const char *to, SCM string_or_bv)
+{
+ /* Raise an exception that conveys all the information needed to debug the
+ problem. Only perform locale conversions that are safe; in particular,
+ don't try to display STRING_OR_BV when it's a string since converting it to
+ the output locale may fail. */
+ scm_throw (scm_encoding_error_key,
+ scm_list_n (scm_from_locale_string (subr),
+ scm_from_locale_string (message),
+ scm_from_int (err),
+ scm_from_locale_string (from),
+ scm_from_locale_string (to),
+ string_or_bv,
+ SCM_UNDEFINED));
}
SCM
NULL,
NULL, &u32len);
- if (u32 == NULL)
+ if (SCM_UNLIKELY (u32 == NULL))
{
- if (errno == ENOMEM)
- scm_memory_error ("locale string conversion");
- else
- {
- /* There are invalid sequences in the input string. */
- SCM errstr;
- char *dst;
- errstr = scm_i_make_string (len, &dst);
- memcpy (dst, str, len);
- scm_encoding_error (NULL,
- "input locale conversion error from ~s: ~s",
- scm_list_2 (scm_from_locale_string (encoding),
- errstr));
- scm_remember_upto_here_1 (errstr);
- }
+ /* Raise an error and pass the raw C string as a bytevector to the `throw'
+ handler. */
+ SCM bv;
+ signed char *buf;
+
+ buf = scm_gc_malloc_pointerless (len, "bytevector");
+ memcpy (buf, str, len);
+ bv = scm_c_take_bytevector (buf, len);
+
+ scm_encoding_error (__func__, errno,
+ "input locale conversion error",
+ encoding, "UTF-32", bv);
}
i = 0;
return res;
}
+SCM
+scm_from_latin1_stringn (const char *str, size_t len)
+{
+ return scm_from_stringn (str, len, NULL, SCM_FAILED_CONVERSION_ERROR);
+}
+
SCM
scm_from_locale_stringn (const char *str, size_t len)
{
/* Change libunistring escapes (\uXXXX and \UXXXXXXXX) to \xXX \uXXXX
and \UXXXXXX. */
-static void
-unistring_escapes_to_guile_escapes (char **bufp, size_t *lenp)
+void
+scm_i_unistring_escapes_to_guile_escapes (char *buf, size_t *lenp)
{
char *before, *after;
size_t i, j;
- before = *bufp;
- after = *bufp;
+ before = buf;
+ after = buf;
i = 0;
j = 0;
while (i < *lenp)
}
}
*lenp = j;
- after = scm_realloc (after, j);
+}
+
+/* Change libunistring escapes (\uXXXX and \UXXXXXXXX) to \xXXXX; */
+void
+scm_i_unistring_escapes_to_r6rs_escapes (char *buf, size_t *lenp)
+{
+ char *before, *after;
+ size_t i, j;
+ /* The worst case is if the input string contains all 4-digit hex escapes.
+ "\uXXXX" (six characters) becomes "\xXXXX;" (seven characters) */
+ size_t max_out_len = (*lenp * 7) / 6 + 1;
+ size_t nzeros, ndigits;
+
+ before = buf;
+ after = alloca (max_out_len);
+ i = 0;
+ j = 0;
+ while (i < *lenp)
+ {
+ if (((i <= *lenp - 6) && before[i] == '\\' && before[i + 1] == 'u')
+ || ((i <= *lenp - 10) && before[i] == '\\' && before[i + 1] == 'U'))
+ {
+ if (before[i + 1] == 'u')
+ ndigits = 4;
+ else if (before[i + 1] == 'U')
+ ndigits = 8;
+ else
+ abort ();
+
+ /* Add the R6RS hex escape initial sequence. */
+ after[j] = '\\';
+ after[j + 1] = 'x';
+
+ /* Move string positions to the start of the hex numbers. */
+ i += 2;
+ j += 2;
+
+ /* Find the number of initial zeros in this hex number. */
+ nzeros = 0;
+ while (before[i + nzeros] == '0' && nzeros < ndigits)
+ nzeros++;
+
+ /* Copy the number, skipping initial zeros, and then move the string
+ positions. */
+ if (nzeros == ndigits)
+ {
+ after[j] = '0';
+ i += ndigits;
+ j += 1;
+ }
+ else
+ {
+ int pos;
+ for (pos = 0; pos < ndigits - nzeros; pos++)
+ after[j + pos] = tolower ((int) before[i + nzeros + pos]);
+ i += ndigits;
+ j += (ndigits - nzeros);
+ }
+
+ /* Add terminating semicolon. */
+ after[j] = ';';
+ j++;
+ }
+ else
+ {
+ after[j] = before[i];
+ i++;
+ j++;
+ }
+ }
+ *lenp = j;
+ memcpy (before, after, j);
+}
+
+char *
+scm_to_latin1_stringn (SCM str, size_t *lenp)
+{
+ return scm_to_stringn (str, lenp, NULL, SCM_FAILED_CONVERSION_ERROR);
}
char *
(enum iconv_ilseq_handler) handler, NULL,
&buf, &len);
- if (ret == 0 && handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
- unistring_escapes_to_guile_escapes (&buf, &len);
-
if (ret != 0)
- scm_encoding_error (NULL, "cannot convert to output locale ~s: \"~s\"",
- scm_list_2 (scm_from_locale_string (enc), str));
+ scm_encoding_error (__func__, errno,
+ "cannot convert to output locale",
+ "ISO-8859-1", enc, str);
}
else
{
- buf = u32_conv_to_encoding (enc,
+ buf = u32_conv_to_encoding (enc,
(enum iconv_ilseq_handler) handler,
- (scm_t_uint32 *) scm_i_string_wide_chars (str),
+ (scm_t_uint32 *) scm_i_string_wide_chars (str),
ilen,
NULL,
NULL, &len);
if (buf == NULL)
- scm_encoding_error (NULL, "cannot convert to output locale ~s: \"~s\"",
- scm_list_2 (scm_from_locale_string (enc), str));
+ scm_encoding_error (__func__, errno,
+ "cannot convert to output locale",
+ "UTF-32", enc, str);
+ }
+ if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
+ {
+ if (SCM_R6RS_ESCAPES_P)
+ scm_i_unistring_escapes_to_r6rs_escapes (buf, &len);
+ else
+ scm_i_unistring_escapes_to_guile_escapes (buf, &len);
- if (handler == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
- unistring_escapes_to_guile_escapes (&buf, &len);
+ buf = scm_realloc (buf, len);
}
if (lenp)
*lenp = len;