+ CONSUME_PEEKED_BYTE ();
+ buf[2] = (scm_t_uint8) byte;
+ *len = 3;
+
+ *codepoint = ((scm_t_wchar) buf[0] & 0x0f) << 12UL
+ | ((scm_t_wchar) buf[1] & 0x3f) << 6UL
+ | (buf[2] & 0x3f);
+ }
+ else if (buf[0] >= 0xf0 && buf[0] <= 0xf4)
+ {
+ /* 4-byte form. */
+ byte = scm_peek_byte_or_eof_unlocked (port);
+ ASSERT_NOT_EOF (byte);
+
+ if (SCM_UNLIKELY (((byte & 0xc0) != 0x80)
+ || (buf[0] == 0xf0 && byte < 0x90)
+ || (buf[0] == 0xf4 && byte > 0x8f)))
+ goto invalid_seq;
+
+ CONSUME_PEEKED_BYTE ();
+ buf[1] = (scm_t_uint8) byte;
+ *len = 2;
+
+ byte = scm_peek_byte_or_eof_unlocked (port);
+ ASSERT_NOT_EOF (byte);
+
+ if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
+ goto invalid_seq;
+
+ CONSUME_PEEKED_BYTE ();
+ buf[2] = (scm_t_uint8) byte;
+ *len = 3;
+
+ byte = scm_peek_byte_or_eof_unlocked (port);
+ ASSERT_NOT_EOF (byte);
+
+ if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
+ goto invalid_seq;
+
+ CONSUME_PEEKED_BYTE ();
+ buf[3] = (scm_t_uint8) byte;
+ *len = 4;
+
+ *codepoint = ((scm_t_wchar) buf[0] & 0x07) << 18UL
+ | ((scm_t_wchar) buf[1] & 0x3f) << 12UL
+ | ((scm_t_wchar) buf[2] & 0x3f) << 6UL
+ | (buf[3] & 0x3f);
+ }
+ else
+ goto invalid_seq;
+
+ return 0;
+
+ invalid_seq:
+ /* Here we could choose the consume the faulty byte when it's not a
+ valid starting byte, but it's not a requirement. What Section 3.9
+ of Unicode 6.0.0 mandates, though, is to not consume a byte that
+ would otherwise be a valid starting byte. */
+
+ return EILSEQ;
+
+#undef CONSUME_PEEKED_BYTE
+#undef ASSERT_NOT_EOF
+}
+
+/* Read an ISO-8859-1 codepoint (a byte) from PORT. On success, return
+ 0 and set *CODEPOINT to the codepoint that was read, fill BUF with
+ its UTF-8 representation, and set *LEN to the length in bytes.
+ Return `EILSEQ' on error. */
+static int
+get_latin1_codepoint (SCM port, scm_t_wchar *codepoint,
+ char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
+{
+ *codepoint = scm_get_byte_or_eof_unlocked (port);
+
+ if (*codepoint == EOF)
+ *len = 0;
+ else
+ {
+ *len = 1;
+ buf[0] = *codepoint;
+ }
+ return 0;
+}
+
+/* Likewise, read a byte sequence from PORT, passing it through its
+ input conversion descriptor. */
+static int
+get_iconv_codepoint (SCM port, scm_t_wchar *codepoint,
+ char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
+{
+ scm_t_iconv_descriptors *id;
+ scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
+ size_t input_size = 0;
+
+ id = scm_i_port_iconv_descriptors (port, SCM_PORT_READ);
+
+ for (;;)
+ {
+ int byte_read;
+ char *input, *output;
+ size_t input_left, output_left, done;
+
+ byte_read = scm_get_byte_or_eof_unlocked (port);
+ if (SCM_UNLIKELY (byte_read == EOF))
+ {
+ if (SCM_LIKELY (input_size == 0))
+ {
+ *codepoint = (scm_t_wchar) EOF;
+ *len = input_size;
+ return 0;
+ }
+ else
+ {
+ /* EOF found in the middle of a multibyte character. */
+ scm_i_set_pending_eof (port);
+ return EILSEQ;
+ }
+ }
+
+ buf[input_size++] = byte_read;
+
+ input = buf;
+ input_left = input_size;
+ output = (char *) utf8_buf;
+ output_left = sizeof (utf8_buf);
+
+ done = iconv (id->input_cd, &input, &input_left, &output, &output_left);
+
+ if (done == (size_t) -1)
+ {
+ int err = errno;
+ if (SCM_LIKELY (err == EINVAL))
+ /* The input byte sequence did not form a complete
+ character. Read another byte and try again. */
+ continue;
+ else
+ return err;
+ }
+ else
+ {
+ size_t output_size = sizeof (utf8_buf) - output_left;
+ if (SCM_LIKELY (output_size > 0))
+ {
+ /* iconv generated output. Convert the UTF8_BUF sequence
+ to a Unicode code point. */
+ *codepoint = utf8_to_codepoint (utf8_buf, output_size);
+ *len = input_size;
+ return 0;
+ }
+ else
+ {
+ /* iconv consumed some bytes without producing any output.
+ Most likely this means that a Unicode byte-order mark
+ (BOM) was consumed, which should not be included in the
+ returned buf. Shift any remaining bytes to the beginning
+ of buf, and continue the loop. */
+ memmove (buf, input, input_left);
+ input_size = input_left;
+ continue;
+ }
+ }
+ }
+}
+
+/* Read a codepoint from PORT and return it in *CODEPOINT. Fill BUF
+ with the byte representation of the codepoint in PORT's encoding, and
+ set *LEN to the length in bytes of that representation. Return 0 on
+ success and an errno value on error. */
+static SCM_C_INLINE int
+get_codepoint (SCM port, scm_t_wchar *codepoint,
+ char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
+{
+ int err;
+ scm_t_port *pt = SCM_PTAB_ENTRY (port);
+ scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
+
+ if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8)
+ err = get_utf8_codepoint (port, codepoint, (scm_t_uint8 *) buf, len);
+ else if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_LATIN1)
+ err = get_latin1_codepoint (port, codepoint, buf, len);
+ else
+ err = get_iconv_codepoint (port, codepoint, buf, len);
+
+ if (SCM_LIKELY (err == 0))
+ {
+ if (SCM_UNLIKELY (pti->at_stream_start_for_bom_read))
+ {
+ /* Record that we're no longer at stream start. */
+ pti->at_stream_start_for_bom_read = 0;
+ if (pt->rw_random)
+ pti->at_stream_start_for_bom_write = 0;
+
+ /* If we just read a BOM in an encoding that recognizes them,
+ then silently consume it and read another code point. */
+ if (SCM_UNLIKELY
+ (*codepoint == SCM_UNICODE_BOM
+ && (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8
+ || strcmp (pt->encoding, "UTF-16") == 0
+ || strcmp (pt->encoding, "UTF-32") == 0)))
+ return get_codepoint (port, codepoint, buf, len);
+ }
+ update_port_lf (*codepoint, port);
+ }
+ else if (pt->ilseq_handler == SCM_ICONVEH_QUESTION_MARK)
+ {
+ *codepoint = '?';
+ err = 0;
+ update_port_lf (*codepoint, port);
+ }
+
+ return err;
+}
+
+/* Read a codepoint from PORT and return it. */
+scm_t_wchar
+scm_getc_unlocked (SCM port)
+#define FUNC_NAME "scm_getc"
+{
+ int err;
+ size_t len;
+ scm_t_wchar codepoint;
+ char buf[SCM_MBCHAR_BUF_SIZE];
+
+ err = get_codepoint (port, &codepoint, buf, &len);
+ if (SCM_UNLIKELY (err != 0))
+ /* At this point PORT should point past the invalid encoding, as per
+ R6RS-lib Section 8.2.4. */
+ scm_decoding_error (FUNC_NAME, err, "input decoding error", port);
+
+ return codepoint;
+}
+#undef FUNC_NAME
+
+scm_t_wchar
+scm_getc (SCM port)
+{
+ scm_i_pthread_mutex_t *lock;
+ scm_t_wchar ret;
+
+ scm_c_lock_port (port, &lock);
+ ret = scm_getc_unlocked (port);
+ if (lock)
+ scm_i_pthread_mutex_unlock (lock);
+
+
+ return ret;
+}
+
+SCM_DEFINE (scm_read_char, "read-char", 0, 1, 0,
+ (SCM port),
+ "Return the next character available from @var{port}, updating\n"
+ "@var{port} to point to the following character. If no more\n"
+ "characters are available, the end-of-file object is returned.\n"
+ "\n"
+ "When @var{port}'s data cannot be decoded according to its\n"
+ "character encoding, a @code{decoding-error} is raised and\n"
+ "@var{port} points past the erroneous byte sequence.\n")
+#define FUNC_NAME s_scm_read_char
+{
+ scm_t_wchar c;
+ if (SCM_UNBNDP (port))
+ port = scm_current_input_port ();
+ SCM_VALIDATE_OPINPORT (1, port);
+ c = scm_getc_unlocked (port);
+ if (EOF == c)
+ return SCM_EOF_VAL;
+ return SCM_MAKE_CHAR (c);
+}
+#undef FUNC_NAME
+
+
+\f
+
+/* Pushback. */
+\f
+
+
+static void
+scm_i_unget_bytes_unlocked (const unsigned char *buf, size_t len, SCM port)
+#define FUNC_NAME "scm_unget_bytes"
+{
+ scm_t_port *pt = SCM_PTAB_ENTRY (port);
+ size_t old_len, new_len;
+
+ scm_i_clear_pending_eof (port);
+
+ if (pt->read_buf != pt->putback_buf)
+ /* switch to the put-back buffer. */
+ {
+ if (pt->putback_buf == NULL)
+ {
+ pt->putback_buf_size = (len > SCM_INITIAL_PUTBACK_BUF_SIZE
+ ? len : SCM_INITIAL_PUTBACK_BUF_SIZE);
+ pt->putback_buf
+ = (unsigned char *) scm_gc_malloc_pointerless
+ (pt->putback_buf_size, "putback buffer");
+ }
+
+ pt->saved_read_buf = pt->read_buf;
+ pt->saved_read_pos = pt->read_pos;
+ pt->saved_read_end = pt->read_end;
+ pt->saved_read_buf_size = pt->read_buf_size;
+
+ /* Put read_pos at the end of the buffer, so that ungets will not
+ have to shift the buffer contents each time. */
+ pt->read_buf = pt->putback_buf;
+ pt->read_pos = pt->read_end = pt->putback_buf + pt->putback_buf_size;
+ pt->read_buf_size = pt->putback_buf_size;
+ }
+
+ old_len = pt->read_end - pt->read_pos;
+ new_len = old_len + len;
+
+ if (new_len > pt->read_buf_size)
+ /* The putback buffer needs to be enlarged. */
+ {
+ size_t new_buf_size;
+ unsigned char *new_buf, *new_end, *new_pos;
+
+ new_buf_size = pt->read_buf_size * 2;
+ if (new_buf_size < new_len)
+ new_buf_size = new_len;
+
+ new_buf = (unsigned char *)
+ scm_gc_malloc_pointerless (new_buf_size, "putback buffer");
+
+ /* Put the bytes at the end of the buffer, so that future
+ ungets won't need to shift the buffer. */
+ new_end = new_buf + new_buf_size;
+ new_pos = new_end - old_len;
+ memcpy (new_pos, pt->read_pos, old_len);
+
+ pt->read_buf = pt->putback_buf = new_buf;
+ pt->read_pos = new_pos;
+ pt->read_end = new_end;
+ pt->read_buf_size = pt->putback_buf_size = new_buf_size;
+ }
+ else if (pt->read_buf + len < pt->read_pos)
+ /* If needed, shift the existing buffer contents up.
+ This should not happen unless some external code
+ manipulates the putback buffer pointers. */
+ {
+ unsigned char *new_end = pt->read_buf + pt->read_buf_size;
+ unsigned char *new_pos = new_end - old_len;
+
+ memmove (new_pos, pt->read_pos, old_len);
+ pt->read_pos = new_pos;
+ pt->read_end = new_end;
+ }
+
+ /* Move read_pos back and copy the bytes there. */
+ pt->read_pos -= len;
+ memcpy (pt->read_buf + (pt->read_pos - pt->read_buf), buf, len);
+
+ if (pt->rw_active == SCM_PORT_WRITE)
+ scm_flush (port);
+
+ if (pt->rw_random)
+ pt->rw_active = SCM_PORT_READ;
+}
+#undef FUNC_NAME
+
+void
+scm_unget_bytes_unlocked (const unsigned char *buf, size_t len, SCM port)