+/* A fluid specifying the default encoding for newly created ports. If it is
+ a string, that is the encoding. If it is #f, it is in the "native"
+ (Latin-1) encoding. */
+SCM_VARIABLE (default_port_encoding_var, "%default-port-encoding");
+
+static int scm_port_encoding_init = 0;
+
+/* Use ENCODING as the default encoding for future ports. */
+void
+scm_i_set_default_port_encoding (const char *encoding)
+{
+ if (!scm_port_encoding_init
+ || !scm_is_fluid (SCM_VARIABLE_REF (default_port_encoding_var)))
+ scm_misc_error (NULL, "tried to set port encoding fluid before it is initialized",
+ SCM_EOL);
+
+ if (encoding == NULL
+ || c_strcasecmp (encoding, "ASCII") == 0
+ || c_strcasecmp (encoding, "ANSI_X3.4-1968") == 0
+ || c_strcasecmp (encoding, "ISO-8859-1") == 0)
+ scm_fluid_set_x (SCM_VARIABLE_REF (default_port_encoding_var), SCM_BOOL_F);
+ else
+ scm_fluid_set_x (SCM_VARIABLE_REF (default_port_encoding_var),
+ scm_from_locale_string (encoding));
+}
+
+/* Return the name of the default encoding for newly created ports; a
+ return value of NULL means "ISO-8859-1". */
+const char *
+scm_i_default_port_encoding (void)
+{
+ if (!scm_port_encoding_init)
+ return NULL;
+ else if (!scm_is_fluid (SCM_VARIABLE_REF (default_port_encoding_var)))
+ return NULL;
+ else
+ {
+ SCM encoding;
+
+ encoding = scm_fluid_ref (SCM_VARIABLE_REF (default_port_encoding_var));
+ if (!scm_is_string (encoding))
+ return NULL;
+ else
+ return scm_i_string_chars (encoding);
+ }
+}
+
+/* If the next LEN bytes from PORT are equal to those in BYTES, then
+ return 1, else return 0. Leave the port position unchanged. */
+static int
+looking_at_bytes (SCM port, const unsigned char *bytes, int len)
+{
+ scm_t_port *pt = SCM_PTAB_ENTRY (port);
+ int i = 0;
+
+ while (i < len && scm_peek_byte_or_eof (port) == bytes[i])
+ {
+ pt->read_pos++;
+ i++;
+ }
+ scm_i_unget_bytes (bytes, i, port);
+ return (i == len);
+}
+
+static const unsigned char scm_utf8_bom[3] = {0xEF, 0xBB, 0xBF};
+static const unsigned char scm_utf16be_bom[2] = {0xFE, 0xFF};
+static const unsigned char scm_utf16le_bom[2] = {0xFF, 0xFE};
+static const unsigned char scm_utf32be_bom[4] = {0x00, 0x00, 0xFE, 0xFF};
+static const unsigned char scm_utf32le_bom[4] = {0xFF, 0xFE, 0x00, 0x00};
+
+/* Decide what byte order to use for a UTF-16 port. Return "UTF-16BE"
+ or "UTF-16LE". MODE must be either SCM_PORT_READ or SCM_PORT_WRITE,
+ and specifies which operation is about to be done. The MODE
+ determines how we will decide the byte order. We deliberately avoid
+ reading from the port unless the user is about to do so. If the user
+ is about to read, then we look for a BOM, and if present, we use it
+ to determine the byte order. Otherwise we choose big endian, as
+ recommended by the Unicode Standard. Note that the BOM (if any) is
+ not consumed here. */
+static const char *
+decide_utf16_encoding (SCM port, scm_t_port_rw_active mode)
+{
+ if (mode == SCM_PORT_READ
+ && SCM_PORT_GET_INTERNAL (port)->at_stream_start_for_bom_read
+ && looking_at_bytes (port, scm_utf16le_bom, sizeof scm_utf16le_bom))
+ return "UTF-16LE";
+ else
+ return "UTF-16BE";
+}
+
+/* Decide what byte order to use for a UTF-32 port. Return "UTF-32BE"
+ or "UTF-32LE". See the comment above 'decide_utf16_encoding' for
+ details. */
+static const char *
+decide_utf32_encoding (SCM port, scm_t_port_rw_active mode)
+{
+ if (mode == SCM_PORT_READ
+ && SCM_PORT_GET_INTERNAL (port)->at_stream_start_for_bom_read
+ && looking_at_bytes (port, scm_utf32le_bom, sizeof scm_utf32le_bom))
+ return "UTF-32LE";
+ else
+ return "UTF-32BE";
+}
+
+static void
+finalize_iconv_descriptors (void *ptr, void *data)
+{
+ close_iconv_descriptors (ptr);
+}
+
+static scm_t_iconv_descriptors *
+open_iconv_descriptors (const char *encoding, int reading, int writing)
+{
+ scm_t_iconv_descriptors *id;
+ iconv_t input_cd, output_cd;
+
+ input_cd = (iconv_t) -1;
+ output_cd = (iconv_t) -1;
+ if (reading)
+ {
+ /* Open an input iconv conversion descriptor, from ENCODING
+ to UTF-8. We choose UTF-8, not UTF-32, because iconv
+ implementations can typically convert from anything to
+ UTF-8, but not to UTF-32 (see
+ <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00007.html>). */
+
+ /* Assume opening an iconv descriptor causes about 16 KB of
+ allocation. */
+ scm_gc_register_allocation (16 * 1024);
+
+ input_cd = iconv_open ("UTF-8", encoding);
+ if (input_cd == (iconv_t) -1)
+ goto invalid_encoding;
+ }
+
+ if (writing)
+ {
+ /* Assume opening an iconv descriptor causes about 16 KB of
+ allocation. */
+ scm_gc_register_allocation (16 * 1024);
+
+ output_cd = iconv_open (encoding, "UTF-8");
+ if (output_cd == (iconv_t) -1)
+ {
+ if (input_cd != (iconv_t) -1)
+ iconv_close (input_cd);
+ goto invalid_encoding;
+ }
+ }
+
+ id = scm_gc_malloc_pointerless (sizeof (*id), "iconv descriptors");
+ id->input_cd = input_cd;
+ id->output_cd = output_cd;
+
+ /* Register a finalizer to close the descriptors. */
+ scm_i_set_finalizer (id, finalize_iconv_descriptors, NULL);
+
+ return id;
+
+ invalid_encoding:
+ {
+ SCM err;
+ err = scm_from_locale_string (encoding);
+ scm_misc_error ("open_iconv_descriptors",
+ "invalid or unknown character encoding ~s",
+ scm_list_1 (err));
+ }
+}
+
+static void
+close_iconv_descriptors (scm_t_iconv_descriptors *id)
+{
+ if (id->input_cd != (iconv_t) -1)
+ iconv_close (id->input_cd);
+ if (id->output_cd != (iconv_t) -1)
+ iconv_close (id->output_cd);
+ id->input_cd = (void *) -1;
+ id->output_cd = (void *) -1;
+}
+
+/* Return the iconv_descriptors, initializing them if necessary. MODE
+ must be either SCM_PORT_READ or SCM_PORT_WRITE, and specifies which
+ operation is about to be done. We deliberately avoid reading from
+ the port unless the user was about to do so. */
+scm_t_iconv_descriptors *
+scm_i_port_iconv_descriptors (SCM port, scm_t_port_rw_active mode)
+{
+ scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
+
+ assert (pti->encoding_mode == SCM_PORT_ENCODING_MODE_ICONV);
+
+ if (!pti->iconv_descriptors)
+ {
+ scm_t_port *pt = SCM_PTAB_ENTRY (port);
+ const char *precise_encoding;
+
+ if (!pt->encoding)
+ pt->encoding = "ISO-8859-1";
+
+ /* If the specified encoding is UTF-16 or UTF-32, then make
+ that more precise by deciding what byte order to use. */
+ if (c_strcasecmp (pt->encoding, "UTF-16") == 0)
+ precise_encoding = decide_utf16_encoding (port, mode);
+ else if (c_strcasecmp (pt->encoding, "UTF-32") == 0)
+ precise_encoding = decide_utf32_encoding (port, mode);
+ else
+ precise_encoding = pt->encoding;
+
+ pti->iconv_descriptors =
+ open_iconv_descriptors (precise_encoding,
+ SCM_INPUT_PORT_P (port),
+ SCM_OUTPUT_PORT_P (port));
+ }
+
+ return pti->iconv_descriptors;
+}
+
+void
+scm_i_set_port_encoding_x (SCM port, const char *encoding)
+{
+ scm_t_port *pt;
+ scm_t_port_internal *pti;
+ scm_t_iconv_descriptors *prev;
+
+ /* Set the character encoding for this port. */
+ pt = SCM_PTAB_ENTRY (port);
+ pti = SCM_PORT_GET_INTERNAL (port);
+ prev = pti->iconv_descriptors;
+
+ /* In order to handle cases where the encoding changes mid-stream
+ (e.g. within an HTTP stream, or within a file that is composed of
+ segments with different encodings), we consider this to be "stream
+ start" for purposes of BOM handling, regardless of our actual file
+ position. */
+ pti->at_stream_start_for_bom_read = 1;
+ pti->at_stream_start_for_bom_write = 1;
+
+ if (encoding == NULL)
+ encoding = "ISO-8859-1";
+
+ /* If ENCODING is UTF-8, then no conversion descriptor is opened
+ because we do I/O ourselves. This saves 100+ KiB for each
+ descriptor. */
+ pt->encoding = scm_gc_strdup (encoding, "port");
+ if (c_strcasecmp (encoding, "UTF-8") == 0)
+ pti->encoding_mode = SCM_PORT_ENCODING_MODE_UTF8;
+ else
+ pti->encoding_mode = SCM_PORT_ENCODING_MODE_ICONV;
+
+ pti->iconv_descriptors = NULL;
+ if (prev)
+ close_iconv_descriptors (prev);
+}
+
+SCM_DEFINE (scm_port_encoding, "port-encoding", 1, 0, 0,
+ (SCM port),
+ "Returns, as a string, the character encoding that @var{port}\n"
+ "uses to interpret its input and output.\n")
+#define FUNC_NAME s_scm_port_encoding
+{
+ scm_t_port *pt;
+ const char *enc;
+
+ SCM_VALIDATE_PORT (1, port);
+
+ pt = SCM_PTAB_ENTRY (port);
+ enc = pt->encoding;
+ if (enc)
+ return scm_from_locale_string (pt->encoding);
+ else
+ return SCM_BOOL_F;
+}
+#undef FUNC_NAME
+
+SCM_DEFINE (scm_set_port_encoding_x, "set-port-encoding!", 2, 0, 0,
+ (SCM port, SCM enc),
+ "Sets the character encoding that will be used to interpret all\n"
+ "port I/O. New ports are created with the encoding\n"
+ "appropriate for the current locale if @code{setlocale} has \n"
+ "been called or ISO-8859-1 otherwise\n"
+ "and this procedure can be used to modify that encoding.\n")
+#define FUNC_NAME s_scm_set_port_encoding_x
+{
+ char *enc_str;
+
+ SCM_VALIDATE_PORT (1, port);
+ SCM_VALIDATE_STRING (2, enc);
+
+ enc_str = scm_to_locale_string (enc);
+ scm_i_set_port_encoding_x (port, enc_str);
+ free (enc_str);
+
+ return SCM_UNSPECIFIED;
+}
+#undef FUNC_NAME
+
+
+/* A fluid specifying the default conversion handler for newly created
+ ports. Its value should be one of the symbols below. */
+SCM_VARIABLE (default_conversion_strategy_var,
+ "%default-port-conversion-strategy");
+
+/* Whether the above fluid is initialized. */
+static int scm_conversion_strategy_init = 0;
+
+/* The possible conversion strategies. */
+SCM_SYMBOL (sym_error, "error");
+SCM_SYMBOL (sym_substitute, "substitute");
+SCM_SYMBOL (sym_escape, "escape");
+
+/* Return the default failed encoding conversion policy for new created
+ ports. */
+scm_t_string_failed_conversion_handler
+scm_i_default_port_conversion_handler (void)
+{
+ scm_t_string_failed_conversion_handler handler;
+
+ if (!scm_conversion_strategy_init
+ || !scm_is_fluid (SCM_VARIABLE_REF (default_conversion_strategy_var)))
+ handler = SCM_FAILED_CONVERSION_QUESTION_MARK;
+ else
+ {
+ SCM fluid, value;
+
+ fluid = SCM_VARIABLE_REF (default_conversion_strategy_var);
+ value = scm_fluid_ref (fluid);
+
+ if (scm_is_eq (sym_substitute, value))
+ handler = SCM_FAILED_CONVERSION_QUESTION_MARK;
+ else if (scm_is_eq (sym_escape, value))
+ handler = SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE;
+ else
+ /* Default to 'error also when the fluid's value is not one of
+ the valid symbols. */
+ handler = SCM_FAILED_CONVERSION_ERROR;
+ }
+
+ return handler;
+}
+
+/* Use HANDLER as the default conversion strategy for future ports. */
+void
+scm_i_set_default_port_conversion_handler (scm_t_string_failed_conversion_handler
+ handler)
+{
+ SCM strategy;
+
+ if (!scm_conversion_strategy_init
+ || !scm_is_fluid (SCM_VARIABLE_REF (default_conversion_strategy_var)))
+ scm_misc_error (NULL, "tried to set conversion strategy fluid before it is initialized",
+ SCM_EOL);
+
+ switch (handler)
+ {
+ case SCM_FAILED_CONVERSION_ERROR:
+ strategy = sym_error;
+ break;
+
+ case SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE:
+ strategy = sym_escape;
+ break;
+
+ case SCM_FAILED_CONVERSION_QUESTION_MARK:
+ strategy = sym_substitute;
+ break;
+
+ default:
+ abort ();
+ }
+
+ scm_fluid_set_x (SCM_VARIABLE_REF (default_conversion_strategy_var),
+ strategy);
+}
+
+SCM_DEFINE (scm_port_conversion_strategy, "port-conversion-strategy",
+ 1, 0, 0, (SCM port),
+ "Returns the behavior of the port when handling a character that\n"
+ "is not representable in the port's current encoding.\n"
+ "It returns the symbol @code{error} if unrepresentable characters\n"
+ "should cause exceptions, @code{substitute} if the port should\n"
+ "try to replace unrepresentable characters with question marks or\n"
+ "approximate characters, or @code{escape} if unrepresentable\n"
+ "characters should be converted to string escapes.\n"
+ "\n"
+ "If @var{port} is @code{#f}, then the current default behavior\n"
+ "will be returned. New ports will have this default behavior\n"
+ "when they are created.\n")
+#define FUNC_NAME s_scm_port_conversion_strategy
+{
+ scm_t_string_failed_conversion_handler h;
+
+ if (scm_is_false (port))
+ h = scm_i_default_port_conversion_handler ();
+ else
+ {
+ scm_t_port *pt;
+
+ SCM_VALIDATE_OPPORT (1, port);
+ pt = SCM_PTAB_ENTRY (port);
+
+ h = pt->ilseq_handler;
+ }
+
+ if (h == SCM_FAILED_CONVERSION_ERROR)
+ return scm_from_latin1_symbol ("error");
+ else if (h == SCM_FAILED_CONVERSION_QUESTION_MARK)
+ return scm_from_latin1_symbol ("substitute");
+ else if (h == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
+ return scm_from_latin1_symbol ("escape");
+ else
+ abort ();
+
+ /* Never gets here. */
+ return SCM_UNDEFINED;
+}
+#undef FUNC_NAME
+
+SCM_DEFINE (scm_set_port_conversion_strategy_x, "set-port-conversion-strategy!",
+ 2, 0, 0,
+ (SCM port, SCM sym),
+ "Sets the behavior of the interpreter when outputting a character\n"
+ "that is not representable in the port's current encoding.\n"
+ "@var{sym} can be either @code{'error}, @code{'substitute}, or\n"
+ "@code{'escape}. If it is @code{'error}, an error will be thrown\n"
+ "when an unconvertible character is encountered. If it is\n"
+ "@code{'substitute}, then unconvertible characters will \n"
+ "be replaced with approximate characters, or with question marks\n"
+ "if no approximately correct character is available.\n"
+ "If it is @code{'escape},\n"
+ "it will appear as a hex escape when output.\n"
+ "\n"
+ "If @var{port} is an open port, the conversion error behavior\n"
+ "is set for that port. If it is @code{#f}, it is set as the\n"
+ "default behavior for any future ports that get created in\n"
+ "this thread.\n")
+#define FUNC_NAME s_scm_set_port_conversion_strategy_x
+{
+ scm_t_string_failed_conversion_handler handler;
+
+ if (scm_is_eq (sym, sym_error))
+ handler = SCM_FAILED_CONVERSION_ERROR;
+ else if (scm_is_eq (sym, sym_substitute))
+ handler = SCM_FAILED_CONVERSION_QUESTION_MARK;
+ else if (scm_is_eq (sym, sym_escape))
+ handler = SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE;
+ else
+ SCM_MISC_ERROR ("unknown conversion strategy ~s", scm_list_1 (sym));
+
+ if (scm_is_false (port))
+ scm_i_set_default_port_conversion_handler (handler);
+ else
+ {
+ SCM_VALIDATE_OPPORT (1, port);
+ SCM_PTAB_ENTRY (port)->ilseq_handler = handler;
+ }
+
+ return SCM_UNSPECIFIED;
+}
+#undef FUNC_NAME
+
+
+