/* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004, 2006,
- * 2007, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc.
+ * 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
#include <uniconv.h>
#include <unistr.h>
#include <striconveh.h>
+#include <c-strcase.h>
#include <assert.h>
#include "libguile/weaks.h"
#include "libguile/fluids.h"
#include "libguile/eq.h"
+#include "libguile/alist.h"
#ifdef HAVE_STRING_H
#include <string.h>
scm_ptobs[SCM_TC2PTOBNUM (tc)].input_waiting = input_waiting;
}
-SCM
-scm_i_port_alist (SCM port)
+/* Set the pending_eof flag in PORT's internal structure.
+   scm_i_fill_input checks this flag: when it is set, it clears it and
+   returns EOF.  */
+static void
+scm_i_set_pending_eof (SCM port)
{
- return SCM_PORT_GET_INTERNAL (port)->alist;
+ SCM_PORT_GET_INTERNAL (port)->pending_eof = 1;
}
-void
-scm_i_set_port_alist_x (SCM port, SCM alist)
+/* Reset the pending_eof flag in PORT's internal structure to 0.  */
+static void
+scm_i_clear_pending_eof (SCM port)
{
- SCM_PORT_GET_INTERNAL (port)->alist = alist;
+ SCM_PORT_GET_INTERNAL (port)->pending_eof = 0;
+}
+
+/* Scheme procedure %port-property: look KEY up (via scm_assq_ref) in
+   the association list stored in PORT's internal structure.  PORT is
+   validated with SCM_VALIDATE_OPPORT first.  */
+SCM_DEFINE (scm_i_port_property, "%port-property", 2, 0, 0,
+            (SCM port, SCM key),
+            "Return the property of @var{port} associated with @var{key}.")
+#define FUNC_NAME s_scm_i_port_property
+{
+  scm_t_port_internal *internal;
+
+  SCM_VALIDATE_OPPORT (1, port);
+  internal = SCM_PORT_GET_INTERNAL (port);
+  return scm_assq_ref (internal->alist, key);
}
+#undef FUNC_NAME
+
+/* Scheme procedure %set-port-property!: store VALUE under KEY in the
+   association list kept in PORT's internal structure, replacing the
+   list with whatever scm_assq_set_x returns.  PORT is validated with
+   SCM_VALIDATE_OPPORT first.  Returns SCM_UNSPECIFIED.  */
+SCM_DEFINE (scm_i_set_port_property_x, "%set-port-property!", 3, 0, 0,
+            (SCM port, SCM key, SCM value),
+            "Set the property of @var{port} associated with @var{key} to @var{value}.")
+#define FUNC_NAME s_scm_i_set_port_property_x
+{
+  scm_t_port_internal *internal;
+  SCM updated;
+
+  SCM_VALIDATE_OPPORT (1, port);
+  internal = SCM_PORT_GET_INTERNAL (port);
+  updated = scm_assq_set_x (internal->alist, key, value);
+  internal->alist = updated;
+  return SCM_UNSPECIFIED;
+}
+#undef FUNC_NAME
\f
}
#undef FUNC_NAME
+/* Cached result of looking up the private `current-warning-port'
+   binding in the "guile" module, and the once-control object that
+   guards its initialization.  */
+static SCM current_warning_port_var;
+static scm_i_pthread_once_t current_warning_port_once = SCM_I_PTHREAD_ONCE_INIT;
+
+/* Initializer passed to scm_i_pthread_once together with
+   current_warning_port_once: fills in current_warning_port_var.  */
+static void
+init_current_warning_port_var (void)
+{
+ current_warning_port_var
+ = scm_c_private_variable ("guile", "current-warning-port");
+}
+
+/* Return the result of calling, with no arguments, the value held in
+   the private `current-warning-port' variable of the "guile" module.
+   The variable lookup is performed once, through scm_i_pthread_once,
+   and cached in current_warning_port_var.  */
SCM
scm_current_warning_port (void)
{
- static SCM cwp_var = SCM_UNDEFINED;
- static scm_i_pthread_mutex_t cwp_var_mutex
- = SCM_I_PTHREAD_MUTEX_INITIALIZER;
-
- scm_i_scm_pthread_mutex_lock (&cwp_var_mutex);
- if (SCM_UNBNDP (cwp_var))
- cwp_var = scm_c_private_variable ("guile", "current-warning-port");
- scm_i_pthread_mutex_unlock (&cwp_var_mutex);
-
- return scm_call_0 (scm_variable_ref (cwp_var));
+ scm_i_pthread_once (&current_warning_port_once,
+ init_current_warning_port_var);
+ return scm_call_0 (scm_variable_ref (current_warning_port_var));
}
SCM_DEFINE (scm_current_load_port, "current-load-port", 0, 0, 0,
+/* Call the value held in the private `current-warning-port' variable
+   of the "guile" module with PORT as its single argument and return
+   the result.  The variable lookup is performed once, through
+   scm_i_pthread_once, and cached in current_warning_port_var.  */
SCM
scm_set_current_warning_port (SCM port)
{
- static SCM cwp_var = SCM_BOOL_F;
-
- if (scm_is_false (cwp_var))
- cwp_var = scm_c_private_lookup ("guile", "current-warning-port");
-
- return scm_call_1 (scm_variable_ref (cwp_var), port);
+ scm_i_pthread_once (&current_warning_port_once,
+ init_current_warning_port_var);
+ return scm_call_1 (scm_variable_ref (current_warning_port_var), port);
}
encoding = scm_i_default_port_encoding ();
entry->ilseq_handler = scm_i_default_port_conversion_handler ();
entry->encoding = encoding ? scm_gc_strdup (encoding, "port") : NULL;
- if (encoding && strcmp (encoding, "UTF-8") == 0)
+ if (encoding && c_strcasecmp (encoding, "UTF-8") == 0)
pti->encoding_mode = SCM_PORT_ENCODING_MODE_UTF8;
else
pti->encoding_mode = SCM_PORT_ENCODING_MODE_ICONV;
pti->iconv_descriptors = NULL;
+ pti->at_stream_start_for_bom_read = 1;
+ pti->at_stream_start_for_bom_write = 1;
+
/* XXX These fields are not what they seem. They have been
repurposed, but cannot safely be renamed in 2.0 without breaking
ABI compatibility. This will be cleaned up in 2.2. */
entry->input_cd = pti; /* XXX pointer to the internal port structure */
entry->output_cd = NULL; /* XXX unused */
+ pti->pending_eof = 0;
pti->alist = SCM_EOL;
+ /* Up to and including Guile 2.0.9, 'setvbuf' would only work on file
+ ports. Now all port types can be supported, but it's not clear
+ that port types out in the wild accept having someone else fiddle with
+ their buffer. Thus, conservatively turn it off by default. */
+ pti->setvbuf = NULL;
+
SCM_SET_CELL_TYPE (z, tag);
SCM_SETPTAB_ENTRY (z, entry);
get_iconv_codepoint (SCM port, scm_t_wchar *codepoint,
char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
{
- scm_t_iconv_descriptors *id = scm_i_port_iconv_descriptors (port);
+ scm_t_iconv_descriptors *id;
scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
size_t input_size = 0;
+ id = scm_i_port_iconv_descriptors (port, SCM_PORT_READ);
+
for (;;)
{
int byte_read;
return 0;
}
else
- /* EOF found in the middle of a multibyte character. */
- return EILSEQ;
+ {
+ /* EOF found in the middle of a multibyte character. */
+ scm_i_set_pending_eof (port);
+ return EILSEQ;
+ }
}
buf[input_size++] = byte_read;
err = get_iconv_codepoint (port, codepoint, buf, len);
if (SCM_LIKELY (err == 0))
- update_port_lf (*codepoint, port);
+ {
+ if (SCM_UNLIKELY (pti->at_stream_start_for_bom_read))
+ {
+ /* Record that we're no longer at stream start. */
+ pti->at_stream_start_for_bom_read = 0;
+ if (pt->rw_random)
+ pti->at_stream_start_for_bom_write = 0;
+
+ /* If we just read a BOM in an encoding that recognizes them,
+ then silently consume it and read another code point. */
+ if (SCM_UNLIKELY
+ (*codepoint == SCM_UNICODE_BOM
+ && (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8
+ || c_strcasecmp (pt->encoding, "UTF-16") == 0
+ || c_strcasecmp (pt->encoding, "UTF-32") == 0)))
+ return get_codepoint (port, codepoint, buf, len);
+ }
+ update_port_lf (*codepoint, port);
+ }
else if (pt->ilseq_handler == SCM_ICONVEH_QUESTION_MARK)
{
*codepoint = '?';
scm_i_fill_input (SCM port)
{
scm_t_port *pt = SCM_PTAB_ENTRY (port);
+ scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
assert (pt->read_pos == pt->read_end);
+ if (pti->pending_eof)
+ {
+ pti->pending_eof = 0;
+ return EOF;
+ }
+
if (pt->read_buf == pt->putback_buf)
{
/* finished reading put-back chars. */
if (pt->read_pos >= pt->read_end)
{
if (SCM_UNLIKELY (scm_i_fill_input (port) == EOF))
- return EOF;
+ {
+ scm_i_set_pending_eof (port);
+ return EOF;
+ }
}
return *pt->read_pos;
requested number of bytes. (Note that a single scm_i_fill_input
call does not guarantee to fill the whole of the port's read
buffer.) */
- if (pt->read_buf_size <= 1 && pt->encoding == NULL)
+ if (pt->read_buf_size <= 1 &&
+ (pt->encoding == NULL
+ || c_strcasecmp (pt->encoding, "ISO-8859-1") == 0))
{
/* The port that we are reading from is unbuffered - i.e. does
not have its own persistent buffer - but we have a buffer,
long offset;
scm_t_port *pt = SCM_PTAB_ENTRY (port);
+ scm_i_clear_pending_eof (port);
if (pt->read_buf == pt->putback_buf)
{
offset = pt->read_end - pt->read_pos;
\f
-void
-scm_unget_byte (int c, SCM port)
-#define FUNC_NAME "scm_unget_byte"
+/* Push the LEN bytes starting at BUF back onto PORT's input, so that
+   they sit immediately before the current read position, in the same
+   order as in BUF.
+
+   Any pending EOF on PORT is cleared.  If PORT is not already reading
+   from its putback buffer, the regular read buffer pointers are saved
+   and reading switches to the putback buffer (allocated on first use).
+   Bytes are kept at the END of the putback buffer so that repeated
+   ungets normally need no shifting; the buffer is reallocated (at
+   least doubled) when the combined length does not fit.
+
+   If PORT was last written to, it is flushed; if PORT is rw_random,
+   its rw_active state becomes SCM_PORT_READ.  */
+static void
+scm_i_unget_bytes (const unsigned char *buf, size_t len, SCM port)
+#define FUNC_NAME "scm_unget_bytes"
{
scm_t_port *pt = SCM_PTAB_ENTRY (port);
+ size_t old_len, new_len;
- if (pt->read_buf == pt->putback_buf)
- /* already using the put-back buffer. */
- {
- /* enlarge putback_buf if necessary. */
- if (pt->read_end == pt->read_buf + pt->read_buf_size
- && pt->read_buf == pt->read_pos)
- {
- size_t new_size = pt->read_buf_size * 2;
- unsigned char *tmp = (unsigned char *)
- scm_gc_realloc (pt->putback_buf, pt->read_buf_size, new_size,
- "putback buffer");
-
- pt->read_pos = pt->read_buf = pt->putback_buf = tmp;
- pt->read_end = pt->read_buf + pt->read_buf_size;
- pt->read_buf_size = pt->putback_buf_size = new_size;
- }
-
- /* shift any existing bytes to buffer + 1. */
- if (pt->read_pos == pt->read_end)
- pt->read_end = pt->read_buf + 1;
- else if (pt->read_pos != pt->read_buf + 1)
- {
- int count = pt->read_end - pt->read_pos;
+ scm_i_clear_pending_eof (port);
- memmove (pt->read_buf + 1, pt->read_pos, count);
- pt->read_end = pt->read_buf + 1 + count;
- }
-
- pt->read_pos = pt->read_buf;
- }
- else
+ if (pt->read_buf != pt->putback_buf)
/* switch to the put-back buffer. */
{
if (pt->putback_buf == NULL)
{
+ pt->putback_buf_size = (len > SCM_INITIAL_PUTBACK_BUF_SIZE
+ ? len : SCM_INITIAL_PUTBACK_BUF_SIZE);
pt->putback_buf
= (unsigned char *) scm_gc_malloc_pointerless
- (SCM_INITIAL_PUTBACK_BUF_SIZE, "putback buffer");
- pt->putback_buf_size = SCM_INITIAL_PUTBACK_BUF_SIZE;
+ (pt->putback_buf_size, "putback buffer");
}
pt->saved_read_buf = pt->read_buf;
pt->saved_read_end = pt->read_end;
pt->saved_read_buf_size = pt->read_buf_size;
- pt->read_pos = pt->read_buf = pt->putback_buf;
- pt->read_end = pt->read_buf + 1;
+ /* Put read_pos at the end of the buffer, so that ungets will not
+ have to shift the buffer contents each time. */
+ pt->read_buf = pt->putback_buf;
+ pt->read_pos = pt->read_end = pt->putback_buf + pt->putback_buf_size;
pt->read_buf_size = pt->putback_buf_size;
}
- *pt->read_buf = c;
+ old_len = pt->read_end - pt->read_pos;
+ new_len = old_len + len;
+
+ if (new_len > pt->read_buf_size)
+ /* The putback buffer needs to be enlarged. */
+ {
+ size_t new_buf_size;
+ unsigned char *new_buf, *new_end, *new_pos;
+
+ new_buf_size = pt->read_buf_size * 2;
+ if (new_buf_size < new_len)
+ new_buf_size = new_len;
+
+ new_buf = (unsigned char *)
+ scm_gc_malloc_pointerless (new_buf_size, "putback buffer");
+
+ /* Put the bytes at the end of the buffer, so that future
+ ungets won't need to shift the buffer. */
+ new_end = new_buf + new_buf_size;
+ new_pos = new_end - old_len;
+ memcpy (new_pos, pt->read_pos, old_len);
+
+ pt->read_buf = pt->putback_buf = new_buf;
+ pt->read_pos = new_pos;
+ pt->read_end = new_end;
+ pt->read_buf_size = pt->putback_buf_size = new_buf_size;
+ }
+ else if (pt->read_buf + len > pt->read_pos)
+ /* Fewer than LEN free bytes precede read_pos, so shift the
+ existing buffer contents up to the end of the buffer to make
+ room. The total is known to fit because new_len does not
+ exceed read_buf_size. This should not happen unless some
+ external code manipulates the putback buffer pointers. */
+ {
+ unsigned char *new_end = pt->read_buf + pt->read_buf_size;
+ unsigned char *new_pos = new_end - old_len;
+
+ memmove (new_pos, pt->read_pos, old_len);
+ pt->read_pos = new_pos;
+ pt->read_end = new_end;
+ }
+
+ /* Move read_pos back and copy the bytes there. The destination
+ is expressed as an offset from read_buf; it denotes the same
+ address as the new read_pos. */
+ pt->read_pos -= len;
+ memcpy (pt->read_buf + (pt->read_pos - pt->read_buf), buf, len);
+
+ if (pt->rw_active == SCM_PORT_WRITE)
+ scm_flush (port);
if (pt->rw_random)
pt->rw_active = SCM_PORT_READ;
}
#undef FUNC_NAME
+/* Exported entry point: push the LEN bytes at BUF back onto PORT by
+   delegating to the file-local scm_i_unget_bytes.  */
+void
+scm_unget_bytes (const unsigned char *buf, size_t len, SCM port)
+{
+ scm_i_unget_bytes (buf, len, port);
+}
+
+/* Push the single byte C (converted to unsigned char) back onto PORT,
+   as a one-byte call to scm_i_unget_bytes.  */
+void
+scm_unget_byte (int c, SCM port)
+{
+  const unsigned char octet = (unsigned char) c;
+
+  scm_i_unget_bytes (&octet, 1, port);
+}
+
void
scm_ungetc (scm_t_wchar c, SCM port)
#define FUNC_NAME "scm_ungetc"
char result_buf[10];
const char *encoding;
size_t len;
- int i;
if (pt->encoding != NULL)
encoding = pt->encoding;
"conversion to port encoding failed",
SCM_BOOL_F, SCM_MAKE_CHAR (c));
- for (i = len - 1; i >= 0; i--)
- scm_unget_byte (result[i], port);
+ scm_i_unget_bytes ((unsigned char *) result, len, port);
if (SCM_UNLIKELY (result != result_buf))
free (result);
SCM result;
scm_t_wchar c;
char bytes[SCM_MBCHAR_BUF_SIZE];
- long column, line, i;
+ long column, line;
size_t len;
if (SCM_UNBNDP (port))
err = get_codepoint (port, &c, bytes, &len);
- for (i = len - 1; i >= 0; i--)
- scm_unget_byte (bytes[i], port);
+ scm_i_unget_bytes ((unsigned char *) bytes, len, port);
SCM_COL (port) = column;
SCM_LINUM (port) = line;
result = SCM_BOOL_F;
}
else if (c == EOF)
- result = SCM_EOF_VAL;
+ {
+ scm_i_set_pending_eof (port);
+ result = SCM_EOF_VAL;
+ }
else
result = SCM_MAKE_CHAR (c);
if (SCM_OPPORTP (fd_port))
{
+ scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (fd_port);
scm_t_ptob_descriptor *ptob = scm_ptobs + SCM_PTOBNUM (fd_port);
off_t_or_off64_t off = scm_to_off_t_or_off64_t (offset);
off_t_or_off64_t rv;
SCM_MISC_ERROR ("port is not seekable",
scm_cons (fd_port, SCM_EOL));
else
- rv = ptob->seek (fd_port, off, how);
+ rv = ptob->seek (fd_port, off, how);
+
+ /* Set stream-start flags according to new position. */
+ pti->at_stream_start_for_bom_read = (rv == 0);
+ pti->at_stream_start_for_bom_write = (rv == 0);
+
+ scm_i_clear_pending_eof (fd_port);
+
return scm_from_off_t_or_off64_t (rv);
}
else /* file descriptor?. */
off_t_or_off64_t c_length = scm_to_off_t_or_off64_t (length);
scm_t_port *pt = SCM_PTAB_ENTRY (object);
scm_t_ptob_descriptor *ptob = scm_ptobs + SCM_PTOBNUM (object);
-
+
if (!ptob->truncate)
SCM_MISC_ERROR ("port is not truncatable", SCM_EOL);
+
+ scm_i_clear_pending_eof (object);
if (pt->rw_active == SCM_PORT_READ)
scm_end_input (object);
else if (pt->rw_active == SCM_PORT_WRITE)
ptob->flush (object);
-
+
ptob->truncate (object, c_length);
rv = 0;
}
SCM_EOL);
if (encoding == NULL
- || !strcmp (encoding, "ASCII")
- || !strcmp (encoding, "ANSI_X3.4-1968")
- || !strcmp (encoding, "ISO-8859-1"))
+ || c_strcasecmp (encoding, "ASCII") == 0
+ || c_strcasecmp (encoding, "ANSI_X3.4-1968") == 0
+ || c_strcasecmp (encoding, "ISO-8859-1") == 0)
scm_fluid_set_x (SCM_VARIABLE_REF (default_port_encoding_var), SCM_BOOL_F);
else
scm_fluid_set_x (SCM_VARIABLE_REF (default_port_encoding_var),
}
}
+/* Return 1 if the next LEN bytes available from PORT equal BYTES, and
+   0 otherwise.  Bytes are consumed one at a time while they match and
+   are then pushed back with scm_i_unget_bytes, so the port position
+   is left unchanged.  */
+static int
+looking_at_bytes (SCM port, const unsigned char *bytes, int len)
+{
+  scm_t_port *entry = SCM_PTAB_ENTRY (port);
+  int matched;
+
+  for (matched = 0; matched < len; matched++)
+    {
+      if (scm_peek_byte_or_eof (port) != bytes[matched])
+        break;
+      /* The peeked byte matched: step past it.  */
+      entry->read_pos++;
+    }
+
+  /* Restore exactly the bytes that were stepped over.  */
+  scm_i_unget_bytes (bytes, matched, port);
+  return matched == len;
+}
+
+/* Byte sequences of the byte-order mark in UTF-8 and in the big- and
+   little-endian forms of UTF-16 and UTF-32.  */
+static const unsigned char scm_utf8_bom[3] = {0xEF, 0xBB, 0xBF};
+static const unsigned char scm_utf16be_bom[2] = {0xFE, 0xFF};
+static const unsigned char scm_utf16le_bom[2] = {0xFF, 0xFE};
+static const unsigned char scm_utf32be_bom[4] = {0x00, 0x00, 0xFE, 0xFF};
+static const unsigned char scm_utf32le_bom[4] = {0xFF, 0xFE, 0x00, 0x00};
+
+/* Choose the byte order for a UTF-16 port, returning either
+   "UTF-16BE" or "UTF-16LE".  MODE must be SCM_PORT_READ or
+   SCM_PORT_WRITE and names the operation about to be performed; it
+   determines how the choice is made.  The port is deliberately not
+   read from unless the user is about to read.  When reading at stream
+   start, a little-endian BOM selects "UTF-16LE".  In every other case
+   big endian is chosen, as recommended by the Unicode Standard.  The
+   BOM, if any, is not consumed here.  */
+static const char *
+decide_utf16_encoding (SCM port, scm_t_port_rw_active mode)
+{
+  /* Only peek at the input when a read is actually imminent.  */
+  if (mode != SCM_PORT_READ)
+    return "UTF-16BE";
+
+  /* A BOM is only meaningful at the start of the stream.  */
+  if (!SCM_PORT_GET_INTERNAL (port)->at_stream_start_for_bom_read)
+    return "UTF-16BE";
+
+  if (looking_at_bytes (port, scm_utf16le_bom, sizeof scm_utf16le_bom))
+    return "UTF-16LE";
+
+  return "UTF-16BE";
+}
+
+/* Choose the byte order for a UTF-32 port, returning either
+   "UTF-32BE" or "UTF-32LE".  MODE must be SCM_PORT_READ or
+   SCM_PORT_WRITE.  The port is only inspected when MODE is
+   SCM_PORT_READ and the port is at stream start; in that case a
+   little-endian BOM selects "UTF-32LE".  In every other case
+   "UTF-32BE" is returned.  The BOM, if any, is not consumed here.  */
+static const char *
+decide_utf32_encoding (SCM port, scm_t_port_rw_active mode)
+{
+  /* Only peek at the input when a read is actually imminent.  */
+  if (mode != SCM_PORT_READ)
+    return "UTF-32BE";
+
+  /* A BOM is only meaningful at the start of the stream.  */
+  if (!SCM_PORT_GET_INTERNAL (port)->at_stream_start_for_bom_read)
+    return "UTF-32BE";
+
+  if (looking_at_bytes (port, scm_utf32le_bom, sizeof scm_utf32le_bom))
+    return "UTF-32LE";
+
+  return "UTF-32BE";
+}
+
static void
finalize_iconv_descriptors (void *ptr, void *data)
{
id->output_cd = (void *) -1;
}
+/* Return the iconv_descriptors, initializing them if necessary. MODE
+ must be either SCM_PORT_READ or SCM_PORT_WRITE, and specifies which
+ operation is about to be done. We deliberately avoid reading from
+ the port unless the user was about to do so. */
scm_t_iconv_descriptors *
-scm_i_port_iconv_descriptors (SCM port)
+scm_i_port_iconv_descriptors (SCM port, scm_t_port_rw_active mode)
{
- scm_t_port *pt;
- scm_t_port_internal *pti;
-
- pt = SCM_PTAB_ENTRY (port);
- pti = SCM_PORT_GET_INTERNAL (port);
+ scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port);
assert (pti->encoding_mode == SCM_PORT_ENCODING_MODE_ICONV);
if (!pti->iconv_descriptors)
{
+ scm_t_port *pt = SCM_PTAB_ENTRY (port);
+ const char *precise_encoding;
+
if (!pt->encoding)
pt->encoding = "ISO-8859-1";
+
+ /* If the specified encoding is UTF-16 or UTF-32, then make
+ that more precise by deciding what byte order to use. */
+ if (c_strcasecmp (pt->encoding, "UTF-16") == 0)
+ precise_encoding = decide_utf16_encoding (port, mode);
+ else if (c_strcasecmp (pt->encoding, "UTF-32") == 0)
+ precise_encoding = decide_utf32_encoding (port, mode);
+ else
+ precise_encoding = pt->encoding;
+
pti->iconv_descriptors =
- open_iconv_descriptors (pt->encoding,
+ open_iconv_descriptors (precise_encoding,
SCM_INPUT_PORT_P (port),
SCM_OUTPUT_PORT_P (port));
}
pti = SCM_PORT_GET_INTERNAL (port);
prev = pti->iconv_descriptors;
+ /* In order to handle cases where the encoding changes mid-stream
+ (e.g. within an HTTP stream, or within a file that is composed of
+ segments with different encodings), we consider this to be "stream
+ start" for purposes of BOM handling, regardless of our actual file
+ position. */
+ pti->at_stream_start_for_bom_read = 1;
+ pti->at_stream_start_for_bom_write = 1;
+
if (encoding == NULL)
encoding = "ISO-8859-1";
/* If ENCODING is UTF-8, then no conversion descriptor is opened
because we do I/O ourselves. This saves 100+ KiB for each
descriptor. */
- if (strcmp (encoding, "UTF-8") == 0)
- {
- pt->encoding = "UTF-8";
- pti->encoding_mode = SCM_PORT_ENCODING_MODE_UTF8;
- pti->iconv_descriptors = NULL;
- }
+ pt->encoding = scm_gc_strdup (encoding, "port");
+ if (c_strcasecmp (encoding, "UTF-8") == 0)
+ pti->encoding_mode = SCM_PORT_ENCODING_MODE_UTF8;
else
- {
- /* Open descriptors before mutating the port. */
- pti->iconv_descriptors =
- open_iconv_descriptors (encoding,
- SCM_INPUT_PORT_P (port),
- SCM_OUTPUT_PORT_P (port));
- pt->encoding = scm_gc_strdup (encoding, "port");
- pti->encoding_mode = SCM_PORT_ENCODING_MODE_ICONV;
- }
+ pti->encoding_mode = SCM_PORT_ENCODING_MODE_ICONV;
+ pti->iconv_descriptors = NULL;
if (prev)
close_iconv_descriptors (prev);
}