+ return pt->iconv_descriptors;
+}
+
+/* The name of the encoding is itself encoded in ASCII. */
+void
+scm_i_set_port_encoding_x (SCM port, const char *encoding)
+{
+ scm_t_port *pt;
+ scm_t_iconv_descriptors *prev;
+
+ /* Set the character encoding for this port. */
+ pt = SCM_PTAB_ENTRY (port);
+ prev = pt->iconv_descriptors;
+
+ if (encoding_matches (encoding, "UTF-8"))
+ {
+ pt->encoding = "UTF-8";
+ pt->encoding_mode = SCM_PORT_ENCODING_MODE_UTF8;
+ pt->iconv_descriptors = NULL;
+ }
+ else if (encoding_matches (encoding, "ISO-8859-1"))
+ {
+ pt->encoding = "ISO-8859-1";
+ pt->encoding_mode = SCM_PORT_ENCODING_MODE_LATIN1;
+ pt->iconv_descriptors = NULL;
+ }
+ else
+ {
+ /* Open descriptors before mutating the port. */
+ char *gc_encoding = canonicalize_encoding (encoding);
+ pt->iconv_descriptors =
+ open_iconv_descriptors (gc_encoding,
+ SCM_INPUT_PORT_P (port),
+ SCM_OUTPUT_PORT_P (port));
+ pt->encoding = gc_encoding;
+ pt->encoding_mode = SCM_PORT_ENCODING_MODE_ICONV;
+ }
+
+ if (prev)
+ close_iconv_descriptors (prev);
+}
+
+SCM_DEFINE (scm_port_encoding, "port-encoding", 1, 0, 0,
+ (SCM port),
+ "Returns, as a string, the character encoding that @var{port}\n"
+ "uses to interpret its input and output.\n")
+#define FUNC_NAME s_scm_port_encoding
+{
+ SCM_VALIDATE_PORT (1, port);
+
+ return scm_from_latin1_string (SCM_PTAB_ENTRY (port)->encoding);
+}
+#undef FUNC_NAME
+
+SCM_DEFINE (scm_set_port_encoding_x, "set-port-encoding!", 2, 0, 0,
+ (SCM port, SCM enc),
+ "Sets the character encoding that will be used to interpret all\n"
+ "port I/O. New ports are created with the encoding\n"
+ "appropriate for the current locale if @code{setlocale} has \n"
+ "been called or ISO-8859-1 otherwise\n"
+ "and this procedure can be used to modify that encoding.\n")
+#define FUNC_NAME s_scm_set_port_encoding_x
+{
+ char *enc_str;
+
+ SCM_VALIDATE_PORT (1, port);
+ SCM_VALIDATE_STRING (2, enc);
+
+ enc_str = scm_to_latin1_string (enc);
+ scm_i_set_port_encoding_x (port, enc_str);
+ free (enc_str);
+
+ return SCM_UNSPECIFIED;
+}
+#undef FUNC_NAME
+
+SCM_DEFINE (scm_port_conversion_strategy, "port-conversion-strategy",
+ 1, 0, 0, (SCM port),
+ "Returns the behavior of the port when handling a character that\n"
+ "is not representable in the port's current encoding.\n"
+ "It returns the symbol @code{error} if unrepresentable characters\n"
+ "should cause exceptions, @code{substitute} if the port should\n"
+ "try to replace unrepresentable characters with question marks or\n"
+ "approximate characters, or @code{escape} if unrepresentable\n"
+ "characters should be converted to string escapes.\n"
+ "\n"
+ "If @var{port} is @code{#f}, then the current default behavior\n"
+ "will be returned. New ports will have this default behavior\n"
+ "when they are created.\n")
+#define FUNC_NAME s_scm_port_conversion_strategy
+{
+ scm_t_string_failed_conversion_handler h;
+
+ SCM_VALIDATE_OPPORT (1, port);
+
+ if (scm_is_false (port))
+ h = scm_i_default_port_conversion_handler ();
+ else
+ {
+ scm_t_port *pt;
+
+ SCM_VALIDATE_OPPORT (1, port);
+ pt = SCM_PTAB_ENTRY (port);
+
+ h = pt->ilseq_handler;
+ }
+
+ if (h == SCM_FAILED_CONVERSION_ERROR)
+ return scm_from_latin1_symbol ("error");
+ else if (h == SCM_FAILED_CONVERSION_QUESTION_MARK)
+ return scm_from_latin1_symbol ("substitute");
+ else if (h == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
+ return scm_from_latin1_symbol ("escape");
+ else
+ abort ();
+
+ /* Never gets here. */
+ return SCM_UNDEFINED;
+}
+#undef FUNC_NAME
+
+SCM_DEFINE (scm_set_port_conversion_strategy_x, "set-port-conversion-strategy!",
+ 2, 0, 0,
+ (SCM port, SCM sym),
+ "Sets the behavior of the interpreter when outputting a character\n"
+ "that is not representable in the port's current encoding.\n"
+ "@var{sym} can be either @code{'error}, @code{'substitute}, or\n"
+ "@code{'escape}. If it is @code{'error}, an error will be thrown\n"
+ "when an unconvertible character is encountered. If it is\n"
+ "@code{'substitute}, then unconvertible characters will \n"
+ "be replaced with approximate characters, or with question marks\n"
+ "if no approximately correct character is available.\n"
+ "If it is @code{'escape},\n"
+ "it will appear as a hex escape when output.\n"
+ "\n"
+ "If @var{port} is an open port, the conversion error behavior\n"
+ "is set for that port. If it is @code{#f}, it is set as the\n"
+ "default behavior for any future ports that get created in\n"
+ "this thread.\n")
+#define FUNC_NAME s_scm_set_port_conversion_strategy_x
+{
+ scm_t_string_failed_conversion_handler handler;
+
+ if (scm_is_eq (sym, sym_error))
+ handler = SCM_FAILED_CONVERSION_ERROR;
+ else if (scm_is_eq (sym, sym_substitute))
+ handler = SCM_FAILED_CONVERSION_QUESTION_MARK;
+ else if (scm_is_eq (sym, sym_escape))
+ handler = SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE;
+ else
+ SCM_MISC_ERROR ("unknown conversion strategy ~s", scm_list_1 (sym));
+
+ if (scm_is_false (port))
+ scm_i_set_default_port_conversion_handler (handler);
+ else
+ {
+ SCM_VALIDATE_OPPORT (1, port);
+ SCM_PTAB_ENTRY (port)->ilseq_handler = handler;
+ }
+
+ return SCM_UNSPECIFIED;
+}
+#undef FUNC_NAME
+
+
+\f
+
+/* The port lock. */
+
+static void
+lock_port (void *mutex)
+{
+ scm_i_pthread_mutex_lock (mutex);
+}
+
+static void
+unlock_port (void *mutex)
+{
+ scm_i_pthread_mutex_unlock (mutex);
+}
+
+void
+scm_dynwind_lock_port (SCM port)
+#define FUNC_NAME "dynwind-lock-port"
+{
+ scm_i_pthread_mutex_t *lock;
+ SCM_VALIDATE_OPPORT (SCM_ARG1, port);
+ scm_c_lock_port (port, &lock);
+ if (lock)
+ {
+ scm_dynwind_unwind_handler (unlock_port, lock, SCM_F_WIND_EXPLICITLY);
+ scm_dynwind_rewind_handler (lock_port, lock, 0);
+ }
+}
+#undef FUNC_NAME
+
+
+\f
+
+/* Input. */
+
+int
+scm_get_byte_or_eof (SCM port)
+{
+ scm_i_pthread_mutex_t *lock;
+ int ret;
+
+ scm_c_lock_port (port, &lock);
+ ret = scm_get_byte_or_eof_unlocked (port);
+ if (lock)
+ scm_i_pthread_mutex_unlock (lock);
+
+ return ret;
+}
+
+int
+scm_peek_byte_or_eof (SCM port)
+{
+ scm_i_pthread_mutex_t *lock;
+ int ret;
+
+ scm_c_lock_port (port, &lock);
+ ret = scm_peek_byte_or_eof_unlocked (port);
+ if (lock)
+ scm_i_pthread_mutex_unlock (lock);
+
+ return ret;
+}
+
+/* scm_c_read
+ *
+ * Used by an application to read arbitrary number of bytes from an
+ * SCM port. Same semantics as libc read, except that scm_c_read only
+ * returns less than SIZE bytes if at end-of-file.
+ *
+ * Warning: Doesn't update port line and column counts! */
+
+/* This structure, and the following swap_buffer function, are used
+ for temporarily swapping a port's own read buffer, and the buffer
+ that the caller of scm_c_read provides. */
+struct port_and_swap_buffer
+{
+ scm_t_port *pt;
+ unsigned char *buffer;
+ size_t size;
+};
+
+static void
+swap_buffer (void *data)
+{
+ struct port_and_swap_buffer *psb = (struct port_and_swap_buffer *) data;
+ unsigned char *old_buf = psb->pt->read_buf;
+ size_t old_size = psb->pt->read_buf_size;
+
+ /* Make the port use (buffer, size) from the struct. */
+ psb->pt->read_pos = psb->pt->read_buf = psb->pt->read_end = psb->buffer;
+ psb->pt->read_buf_size = psb->size;
+
+ /* Save the port's old (buffer, size) in the struct. */
+ psb->buffer = old_buf;
+ psb->size = old_size;
+}
+
+size_t
+scm_c_read_unlocked (SCM port, void *buffer, size_t size)
+#define FUNC_NAME "scm_c_read"
+{
+ scm_t_port *pt;
+ size_t n_read = 0, n_available;
+ struct port_and_swap_buffer psb;
+
+ SCM_VALIDATE_OPINPORT (1, port);
+
+ pt = SCM_PTAB_ENTRY (port);
+ if (pt->rw_active == SCM_PORT_WRITE)
+ SCM_PORT_DESCRIPTOR (port)->flush (port);
+
+ if (pt->rw_random)
+ pt->rw_active = SCM_PORT_READ;
+
+ /* Take bytes first from the port's read buffer. */
+ if (pt->read_pos < pt->read_end)
+ {
+ n_available = min (size, pt->read_end - pt->read_pos);
+ memcpy (buffer, pt->read_pos, n_available);
+ buffer = (char *) buffer + n_available;
+ pt->read_pos += n_available;
+ n_read += n_available;
+ size -= n_available;
+ }
+
+ /* Avoid the scm_dynwind_* costs if we now have enough data. */
+ if (size == 0)
+ return n_read;
+
+ /* Now we will call scm_fill_input repeatedly until we have read the
+ requested number of bytes. (Note that a single scm_fill_input
+ call does not guarantee to fill the whole of the port's read
+ buffer.) */
+ if (pt->read_buf_size <= 1
+ && pt->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1)
+ {
+ /* The port that we are reading from is unbuffered - i.e. does
+ not have its own persistent buffer - but we have a buffer,
+ provided by our caller, that is the right size for the data
+ that is wanted. For the following scm_fill_input calls,
+ therefore, we use the buffer in hand as the port's read
+ buffer.
+
+ We need to make sure that the port's normal (1 byte) buffer
+ is reinstated in case one of the scm_fill_input () calls
+ throws an exception; we use the scm_dynwind_* API to achieve
+ that.
+
+ A consequence of this optimization is that the fill_input
+ functions can't unget characters. That'll push data to the
+ pushback buffer instead of this psb buffer. */
+#if SCM_DEBUG == 1
+ unsigned char *pback = pt->putback_buf;
+#endif
+ psb.pt = pt;
+ psb.buffer = buffer;
+ psb.size = size;
+ scm_dynwind_begin (SCM_F_DYNWIND_REWINDABLE);
+ scm_dynwind_rewind_handler (swap_buffer, &psb, SCM_F_WIND_EXPLICITLY);
+ scm_dynwind_unwind_handler (swap_buffer, &psb, SCM_F_WIND_EXPLICITLY);
+
+ /* Call scm_fill_input until we have all the bytes that we need,
+ or we hit EOF. */
+ while (pt->read_buf_size && (scm_fill_input_unlocked (port) != EOF))
+ {
+ pt->read_buf_size -= (pt->read_end - pt->read_pos);
+ pt->read_pos = pt->read_buf = pt->read_end;
+ }
+#if SCM_DEBUG == 1
+ if (pback != pt->putback_buf
+ || pt->read_buf - (unsigned char *) buffer < 0)
+ scm_misc_error (FUNC_NAME,
+ "scm_c_read must not call a fill function that pushes "
+ "back characters onto an unbuffered port", SCM_EOL);
+#endif
+ n_read += pt->read_buf - (unsigned char *) buffer;
+
+ /* Reinstate the port's normal buffer. */
+ scm_dynwind_end ();
+ }
+ else
+ {
+ /* The port has its own buffer. It is important that we use it,
+ even if it happens to be smaller than our caller's buffer, so
+ that a custom port implementation's entry points (in
+ particular, fill_input) can rely on the buffer always being
+ the same as they first set up. */
+ while (size && (scm_fill_input_unlocked (port) != EOF))
+ {
+ n_available = min (size, pt->read_end - pt->read_pos);
+ memcpy (buffer, pt->read_pos, n_available);
+ buffer = (char *) buffer + n_available;
+ pt->read_pos += n_available;
+ n_read += n_available;
+ size -= n_available;
+ }
+ }
+
+ return n_read;
+}
+#undef FUNC_NAME
+
+size_t
+scm_c_read (SCM port, void *buffer, size_t size)
+{
+ scm_i_pthread_mutex_t *lock;
+ size_t ret;
+
+ scm_c_lock_port (port, &lock);
+ ret = scm_c_read_unlocked (port, buffer, size);
+ if (lock)
+ scm_i_pthread_mutex_unlock (lock);
+
+
+ return ret;
+}
+
+/* Update the line and column number of PORT after consumption of C. */
+static inline void
+update_port_lf (scm_t_wchar c, SCM port)
+{
+ switch (c)
+ {
+ case '\a':
+ case EOF:
+ break;
+ case '\b':
+ SCM_DECCOL (port);
+ break;
+ case '\n':
+ SCM_INCLINE (port);
+ break;
+ case '\r':
+ SCM_ZEROCOL (port);
+ break;
+ case '\t':
+ SCM_TABCOL (port);
+ break;
+ default:
+ SCM_INCCOL (port);
+ break;
+ }
+}
+
+#define SCM_MBCHAR_BUF_SIZE (4)
+
+/* Convert the SIZE-byte UTF-8 sequence in UTF8_BUF to a codepoint.
+ UTF8_BUF is assumed to contain a valid UTF-8 sequence. */
+static scm_t_wchar
+utf8_to_codepoint (const scm_t_uint8 *utf8_buf, size_t size)
+{
+ scm_t_wchar codepoint;
+
+ if (utf8_buf[0] <= 0x7f)
+ {
+ assert (size == 1);
+ codepoint = utf8_buf[0];
+ }
+ else if ((utf8_buf[0] & 0xe0) == 0xc0)
+ {
+ assert (size == 2);
+ codepoint = ((scm_t_wchar) utf8_buf[0] & 0x1f) << 6UL
+ | (utf8_buf[1] & 0x3f);
+ }
+ else if ((utf8_buf[0] & 0xf0) == 0xe0)
+ {
+ assert (size == 3);
+ codepoint = ((scm_t_wchar) utf8_buf[0] & 0x0f) << 12UL
+ | ((scm_t_wchar) utf8_buf[1] & 0x3f) << 6UL
+ | (utf8_buf[2] & 0x3f);
+ }
+ else
+ {
+ assert (size == 4);
+ codepoint = ((scm_t_wchar) utf8_buf[0] & 0x07) << 18UL
+ | ((scm_t_wchar) utf8_buf[1] & 0x3f) << 12UL
+ | ((scm_t_wchar) utf8_buf[2] & 0x3f) << 6UL
+ | (utf8_buf[3] & 0x3f);
+ }
+
+ return codepoint;
+}
+
+/* Read a UTF-8 sequence from PORT. On success, return 0 and set
+ *CODEPOINT to the codepoint that was read, fill BUF with its UTF-8
+ representation, and set *LEN to the length in bytes. Return
+ `EILSEQ' on error. */
+static int
+get_utf8_codepoint (SCM port, scm_t_wchar *codepoint,
+ scm_t_uint8 buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
+{
+#define ASSERT_NOT_EOF(b) \
+ if (SCM_UNLIKELY ((b) == EOF)) \
+ goto invalid_seq
+#define CONSUME_PEEKED_BYTE() \
+ pt->read_pos++
+
+ int byte;
+ scm_t_port *pt;
+
+ *len = 0;
+ pt = SCM_PTAB_ENTRY (port);
+
+ byte = scm_get_byte_or_eof_unlocked (port);
+ if (byte == EOF)
+ {
+ *codepoint = EOF;
+ return 0;
+ }
+
+ buf[0] = (scm_t_uint8) byte;
+ *len = 1;
+
+ if (buf[0] <= 0x7f)
+ /* 1-byte form. */
+ *codepoint = buf[0];
+ else if (buf[0] >= 0xc2 && buf[0] <= 0xdf)
+ {
+ /* 2-byte form. */
+ byte = scm_peek_byte_or_eof_unlocked (port);
+ ASSERT_NOT_EOF (byte);