-/* Copyright (C) 1995,1996,1997,1998,1999,2000,2001, 2003, 2004, 2006, 2007, 2008, 2009 Free Software Foundation, Inc.
- *
+/* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004,
+ * 2006, 2007, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
+ *
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public License
* as published by the Free Software Foundation; either version 3 of
#include <errno.h>
#include <fcntl.h> /* for chsize on mingw */
#include <assert.h>
+#include <iconv.h>
#include <uniconv.h>
#include <unistr.h>
#include <striconveh.h>
#include "libguile/_scm.h"
#include "libguile/async.h"
+#include "libguile/deprecation.h"
#include "libguile/eval.h"
#include "libguile/fports.h" /* direct access for seek and truncate */
-#include "libguile/objects.h"
#include "libguile/goops.h"
#include "libguile/smob.h"
#include "libguile/chars.h"
#include "libguile/validate.h"
#include "libguile/ports.h"
#include "libguile/vectors.h"
-#include "libguile/weaks.h"
+#include "libguile/weak-set.h"
#include "libguile/fluids.h"
#include "libguile/eq.h"
gnulib ftruncate.c has code using fcntl F_CHSIZE and F_FREESP, which
might be possibilities if we've got other systems without ftruncate. */
-#if HAVE_CHSIZE && ! HAVE_FTRUNCATE
+#if defined HAVE_CHSIZE && ! defined HAVE_FTRUNCATE
#define ftruncate(fd, size) chsize (fd, size)
#undef HAVE_FTRUNCATE
#define HAVE_FTRUNCATE 1
* Indexes into this table are used when generating type
* tags for smobjects (if you know a tag you can get an index and conversely).
*/
-scm_t_ptob_descriptor *scm_ptobs;
-long scm_numptob;
+scm_t_ptob_descriptor *scm_ptobs = NULL;
+long scm_numptob = 0;
/* GC marker for a port with stream of SCM type. */
SCM
if (SCM_UNBNDP (port))
port = scm_current_input_port ();
- else
- SCM_VALIDATE_OPINPORT (1, port);
+ /* It's possible to close the current input port, so validate even in
+ this case. */
+ SCM_VALIDATE_OPINPORT (1, port);
pt = SCM_PTAB_ENTRY (port);
if (pt->read_buf == pt->putback_buf)
count += pt->saved_read_end - pt->saved_read_pos;
- result = scm_i_make_string (count, &data);
- scm_take_from_input_buffers (port, data, count);
+ if (count)
+ {
+ result = scm_i_make_string (count, &data, 0);
+ scm_take_from_input_buffers (port, data, count);
+ }
+ else
+ result = scm_nullstr;
+
return result;
}
#undef FUNC_NAME
\f
/* Standard ports --- current input, output, error, and more(!). */
-static SCM cur_inport_fluid = 0;
-static SCM cur_outport_fluid = 0;
-static SCM cur_errport_fluid = 0;
-static SCM cur_loadport_fluid = 0;
+static SCM cur_inport_fluid = SCM_BOOL_F;
+static SCM cur_outport_fluid = SCM_BOOL_F;
+static SCM cur_errport_fluid = SCM_BOOL_F;
+static SCM cur_loadport_fluid = SCM_BOOL_F;
SCM_DEFINE (scm_current_input_port, "current-input-port", 0, 0, 0,
(),
"returns the @dfn{standard input} in Unix and C terminology.")
#define FUNC_NAME s_scm_current_input_port
{
- if (cur_inport_fluid)
+ if (scm_is_true (cur_inport_fluid))
return scm_fluid_ref (cur_inport_fluid);
else
return SCM_BOOL_F;
"Unix and C terminology.")
#define FUNC_NAME s_scm_current_output_port
{
- if (cur_outport_fluid)
+ if (scm_is_true (cur_outport_fluid))
return scm_fluid_ref (cur_outport_fluid);
else
return SCM_BOOL_F;
"@dfn{standard error} in Unix and C terminology).")
#define FUNC_NAME s_scm_current_error_port
{
- if (cur_errport_fluid)
+ if (scm_is_true (cur_errport_fluid))
return scm_fluid_ref (cur_errport_fluid);
else
return SCM_BOOL_F;
We need a global registry of ports to flush them all at exit, and to
get all the ports matching a file descriptor.
*/
-SCM scm_i_port_weak_hash;
-
-scm_i_pthread_mutex_t scm_i_port_table_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER;
+SCM scm_i_port_weak_set;
\f
/* Port finalization. */
static void finalize_port (GC_PTR, GC_PTR);
-/* Register a finalizer for PORT, if needed by its port type. */
+/* Register a finalizer for PORT. */
static SCM_C_INLINE_KEYWORD void
register_finalizer_for_port (SCM port)
{
- long port_type;
-
- port_type = SCM_TC2PTOBNUM (SCM_CELL_TYPE (port));
- if (scm_ptobs[port_type].free)
- {
- GC_finalization_proc prev_finalizer;
- GC_PTR prev_finalization_data;
+ GC_finalization_proc prev_finalizer;
+ GC_PTR prev_finalization_data;
- GC_REGISTER_FINALIZER_NO_ORDER (SCM2PTR (port), finalize_port, 0,
- &prev_finalizer,
- &prev_finalization_data);
- }
+ /* Register a finalizer for PORT so that its iconv CDs get freed and
+ optionally its type's `free' function gets called. */
+ GC_REGISTER_FINALIZER_NO_ORDER (SCM_HEAP_OBJECT_BASE (port),
+ finalize_port, 0,
+ &prev_finalizer,
+ &prev_finalization_data);
}
/* Finalize the object (a port) pointed to by PTR. */
finalize_port (GC_PTR ptr, GC_PTR data)
{
long port_type;
- SCM port = PTR2SCM (ptr);
+ SCM port = SCM_PACK_POINTER (ptr);
if (!SCM_PORTP (port))
abort ();
register_finalizer_for_port (port);
else
{
+ scm_t_port *entry;
+
port_type = SCM_TC2PTOBNUM (SCM_CELL_TYPE (port));
if (port_type >= scm_numptob)
abort ();
is for explicit `close-port' by user. */
scm_ptobs[port_type].free (port);
+ entry = SCM_PTAB_ENTRY (port);
+
+ if (entry->input_cd != (iconv_t) -1)
+ iconv_close (entry->input_cd);
+ if (entry->output_cd != (iconv_t) -1)
+ iconv_close (entry->output_cd);
+
SCM_SETSTREAM (port, 0);
SCM_CLR_PORT_OPEN_FLAG (port);
\f
-/* This function is not and should not be thread safe. */
SCM
-scm_new_port_table_entry (scm_t_bits tag)
-#define FUNC_NAME "scm_new_port_table_entry"
+scm_c_make_port_with_encoding (scm_t_bits tag, unsigned long mode_bits,
+ const char *encoding,
+ scm_t_string_failed_conversion_handler handler,
+ scm_t_bits stream)
{
- /*
- We initialize the cell to empty, this is in case scm_gc_calloc
- triggers GC ; we don't want the GC to scan a half-finished Z.
- */
-
- SCM z = scm_cons (SCM_EOL, SCM_EOL);
- scm_t_port *entry = (scm_t_port *) scm_gc_calloc (sizeof (scm_t_port), "port");
- const char *enc;
+ SCM ret;
+ scm_t_port *entry;
+
+ entry = (scm_t_port *) scm_gc_calloc (sizeof (scm_t_port), "port");
+ ret = scm_cell (tag | mode_bits, (scm_t_bits)entry);
+
+#if SCM_USE_PTHREAD_THREADS
+ scm_i_pthread_mutex_init (&entry->lock, scm_i_pthread_mutexattr_recursive);
+#endif
entry->file_name = SCM_BOOL_F;
entry->rw_active = SCM_PORT_NEITHER;
- entry->port = z;
- /* Initialize this port with the thread's current default
- encoding. */
- if ((enc = scm_i_get_port_encoding (SCM_BOOL_F)) == NULL)
- entry->encoding = NULL;
- else
- entry->encoding = scm_gc_strdup (enc, "port");
- entry->ilseq_handler = scm_i_get_conversion_strategy (SCM_BOOL_F);
-
- SCM_SET_CELL_TYPE (z, tag);
- SCM_SETPTAB_ENTRY (z, entry);
+ entry->port = ret;
+ entry->stream = stream;
+ entry->encoding = encoding ? scm_gc_strdup (encoding, "port") : NULL;
+ /* The conversion descriptors will be opened lazily. */
+ entry->input_cd = (iconv_t) -1;
+ entry->output_cd = (iconv_t) -1;
+ entry->ilseq_handler = handler;
- scm_hashq_set_x (scm_i_port_weak_hash, z, SCM_BOOL_F);
+ scm_weak_set_add_x (scm_i_port_weak_set, ret);
/* For each new port, register a finalizer so that it port type's free
function can be invoked eventually. */
- register_finalizer_for_port (z);
+ register_finalizer_for_port (ret);
- return z;
+ return ret;
}
-#undef FUNC_NAME
-#if SCM_ENABLE_DEPRECATED==1
-SCM_API scm_t_port *
-scm_add_to_port_table (SCM port)
+SCM
+scm_c_make_port (scm_t_bits tag, unsigned long mode_bits, scm_t_bits stream)
{
- SCM z = scm_new_port_table_entry (scm_tc7_port);
- scm_t_port * pt = SCM_PTAB_ENTRY(z);
-
- pt->port = port;
- SCM_SETCAR (z, SCM_EOL);
- SCM_SETCDR (z, SCM_EOL);
- SCM_SETPTAB_ENTRY (port, pt);
- return pt;
+ return scm_c_make_port_with_encoding (tag, mode_bits,
+ scm_i_default_port_encoding (),
+ scm_i_get_conversion_strategy (SCM_BOOL_F),
+ stream);
}
-#endif
+SCM
+scm_new_port_table_entry (scm_t_bits tag)
+{
+ return scm_c_make_port (tag, 0, 0);
+}
/* Remove a port from the table and destroy it. */
-/* This function is not and should not be thread safe. */
-void
+static void
scm_i_remove_port (SCM port)
#define FUNC_NAME "scm_remove_port"
{
- scm_t_port *p = SCM_PTAB_ENTRY (port);
+ scm_t_port *p;
+ p = SCM_PTAB_ENTRY (port);
scm_port_non_buffer (p);
+ SCM_SETPTAB_ENTRY (port, 0);
+ scm_weak_set_remove_x (scm_i_port_weak_set, port);
p->putback_buf = NULL;
p->putback_buf_size = 0;
- SCM_SETPTAB_ENTRY (port, 0);
- scm_hashq_remove_x (scm_i_port_weak_hash, port);
+ if (p->input_cd != (iconv_t) -1)
+ {
+ iconv_close (p->input_cd);
+ p->input_cd = (iconv_t) -1;
+ }
+
+ if (p->output_cd != (iconv_t) -1)
+ {
+ iconv_close (p->output_cd);
+ p->output_cd = (iconv_t) -1;
+ }
}
#undef FUNC_NAME
-/* Functions for debugging. */
-#ifdef GUILE_DEBUG
-SCM_DEFINE (scm_pt_size, "pt-size", 0, 0, 0,
- (),
- "Return the number of ports in the port table. @code{pt-size}\n"
- "is only included in @code{--enable-guile-debug} builds.")
-#define FUNC_NAME s_scm_pt_size
-{
- return scm_from_int (SCM_HASHTABLE_N_ITEMS (scm_i_port_weak_hash));
-}
-#undef FUNC_NAME
-#endif
-
void
scm_port_non_buffer (scm_t_port *pt)
{
rv = (scm_ptobs[i].close) (port);
else
rv = 0;
- scm_i_scm_pthread_mutex_lock (&scm_i_port_table_mutex);
scm_i_remove_port (port);
- scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex);
SCM_CLR_PORT_OPEN_FLAG (port);
return scm_from_bool (rv >= 0);
}
}
#undef FUNC_NAME
+struct for_each_data
+{
+ void (*proc) (void *data, SCM p);
+ void *data;
+};
+
static SCM
-scm_i_collect_keys_in_vector (void *closure, SCM key, SCM value, SCM result)
+for_each_trampoline (void *data, SCM port, SCM result)
{
- int *i = (int*) closure;
- scm_c_vector_set_x (result, *i, key);
- (*i)++;
+ struct for_each_data *d = data;
+
+ d->proc (d->data, port);
return result;
}
void
scm_c_port_for_each (void (*proc)(void *data, SCM p), void *data)
{
- int i = 0;
- size_t n;
- SCM ports;
-
- /* Even without pre-emptive multithreading, running arbitrary code
- while scanning the port table is unsafe because the port table
- can change arbitrarily (from a GC, for example). So we first
- collect the ports into a vector. -mvo */
-
- scm_i_scm_pthread_mutex_lock (&scm_i_port_table_mutex);
- n = SCM_HASHTABLE_N_ITEMS (scm_i_port_weak_hash);
- scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex);
- ports = scm_c_make_vector (n, SCM_BOOL_F);
-
- scm_i_pthread_mutex_lock (&scm_i_port_table_mutex);
- ports = scm_internal_hash_fold (scm_i_collect_keys_in_vector, &i,
- ports, scm_i_port_weak_hash);
- scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex);
-
- for (i = 0; i < n; i++) {
- SCM p = SCM_SIMPLE_VECTOR_REF (ports, i);
- if (SCM_PORTP (p))
- proc (data, p);
- }
+ struct for_each_data d;
+
+ d.proc = proc;
+ d.data = data;
- scm_remember_upto_here_1 (ports);
+ scm_c_weak_set_fold (for_each_trampoline, &d, SCM_EOL,
+ scm_i_port_weak_set);
+}
+
+static void
+scm_for_each_trampoline (void *data, SCM port)
+{
+ scm_call_1 (SCM_PACK_POINTER (data), port);
}
SCM_DEFINE (scm_port_for_each, "port-for-each", 1, 0, 0,
{
SCM_VALIDATE_PROC (1, proc);
- scm_c_port_for_each ((void (*)(void*,SCM))scm_call_1, proc);
+ scm_c_port_for_each (scm_for_each_trampoline, SCM_UNPACK_POINTER (proc));
+
return SCM_UNSPECIFIED;
}
#undef FUNC_NAME
(SCM port),
"Return the next character available from @var{port}, updating\n"
"@var{port} to point to the following character. If no more\n"
- "characters are available, the end-of-file object is returned.")
+ "characters are available, the end-of-file object is returned.\n"
+ "\n"
+ "When @var{port}'s data cannot be decoded according to its\n"
+ "character encoding, a @code{decoding-error} is raised and\n"
+ "@var{port} points past the erroneous byte sequence.\n")
#define FUNC_NAME s_scm_read_char
{
scm_t_wchar c;
}
#undef FUNC_NAME
+/* Update the line and column number of PORT after consumption of C. */
+static inline void
+update_port_lf (scm_t_wchar c, SCM port)
+{
+ switch (c)
+ {
+ case '\a':
+ case EOF:
+ break;
+ case '\b':
+ SCM_DECCOL (port);
+ break;
+ case '\n':
+ SCM_INCLINE (port);
+ break;
+ case '\r':
+ SCM_ZEROCOL (port);
+ break;
+ case '\t':
+ SCM_TABCOL (port);
+ break;
+ default:
+ SCM_INCCOL (port);
+ break;
+ }
+}
+
#define SCM_MBCHAR_BUF_SIZE (4)
-/* Get one codepoint from a file, using the port's encoding. */
-scm_t_wchar
-scm_getc (SCM port)
+/* Convert the SIZE-byte UTF-8 sequence in UTF8_BUF to a codepoint.
+ UTF8_BUF is assumed to contain a valid UTF-8 sequence. */
+static scm_t_wchar
+utf8_to_codepoint (const scm_t_uint8 *utf8_buf, size_t size)
{
- int c;
- unsigned int bufcount = 0;
- char buf[SCM_MBCHAR_BUF_SIZE];
- scm_t_wchar codepoint = 0;
- scm_t_uint32 *u32;
- size_t u32len;
- scm_t_port *pt = SCM_PTAB_ENTRY (port);
+ scm_t_wchar codepoint;
- c = scm_get_byte_or_eof (port);
- if (c == EOF)
- return (scm_t_wchar) EOF;
+ if (utf8_buf[0] <= 0x7f)
+ {
+ assert (size == 1);
+ codepoint = utf8_buf[0];
+ }
+ else if ((utf8_buf[0] & 0xe0) == 0xc0)
+ {
+ assert (size == 2);
+ codepoint = ((scm_t_wchar) utf8_buf[0] & 0x1f) << 6UL
+ | (utf8_buf[1] & 0x3f);
+ }
+ else if ((utf8_buf[0] & 0xf0) == 0xe0)
+ {
+ assert (size == 3);
+ codepoint = ((scm_t_wchar) utf8_buf[0] & 0x0f) << 12UL
+ | ((scm_t_wchar) utf8_buf[1] & 0x3f) << 6UL
+ | (utf8_buf[2] & 0x3f);
+ }
+ else
+ {
+ assert (size == 4);
+ codepoint = ((scm_t_wchar) utf8_buf[0] & 0x07) << 18UL
+ | ((scm_t_wchar) utf8_buf[1] & 0x3f) << 12UL
+ | ((scm_t_wchar) utf8_buf[2] & 0x3f) << 6UL
+ | (utf8_buf[3] & 0x3f);
+ }
+
+ return codepoint;
+}
- buf[0] = c;
- bufcount++;
+/* Read a UTF-8 sequence from PORT. On success, return 0 and set
+ *CODEPOINT to the codepoint that was read, fill BUF with its UTF-8
+ representation, and set *LEN to the length in bytes. Return
+ `EILSEQ' on error. */
+static int
+get_utf8_codepoint (SCM port, scm_t_wchar *codepoint,
+ scm_t_uint8 buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
+{
+#define ASSERT_NOT_EOF(b) \
+ if (SCM_UNLIKELY ((b) == EOF)) \
+ goto invalid_seq
+#define CONSUME_PEEKED_BYTE() \
+ pt->read_pos++
- if (pt->encoding == NULL)
- {
- /* The encoding is Latin-1: bytes are characters. */
- codepoint = (unsigned char) buf[0];
- goto success;
+ int byte;
+ scm_t_port *pt;
+
+ *len = 0;
+ pt = SCM_PTAB_ENTRY (port);
+
+ byte = scm_get_byte_or_eof (port);
+ if (byte == EOF)
+ {
+ *codepoint = EOF;
+ return 0;
}
- for (;;)
+ buf[0] = (scm_t_uint8) byte;
+ *len = 1;
+
+ if (buf[0] <= 0x7f)
+ /* 1-byte form. */
+ *codepoint = buf[0];
+ else if (buf[0] >= 0xc2 && buf[0] <= 0xdf)
{
- u32 = u32_conv_from_encoding (pt->encoding,
- (enum iconv_ilseq_handler) pt->ilseq_handler,
- buf, bufcount, NULL, NULL, &u32len);
- if (u32 == NULL || u32len == 0)
- {
- if (errno == ENOMEM)
- scm_memory_error ("Input decoding");
-
- /* Otherwise errno is EILSEQ or EINVAL, so perhaps more
- bytes are needed. Keep looping. */
- }
- else
- {
- /* Complete codepoint found. */
- codepoint = u32[0];
- free (u32);
- goto success;
- }
+ /* 2-byte form. */
+ byte = scm_peek_byte_or_eof (port);
+ ASSERT_NOT_EOF (byte);
- if (bufcount == SCM_MBCHAR_BUF_SIZE)
- {
- /* We've read several bytes and didn't find a good
- codepoint. Give up. */
- goto failure;
- }
+ if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
+ goto invalid_seq;
+
+ CONSUME_PEEKED_BYTE ();
+ buf[1] = (scm_t_uint8) byte;
+ *len = 2;
+
+ *codepoint = ((scm_t_wchar) buf[0] & 0x1f) << 6UL
+ | (buf[1] & 0x3f);
+ }
+ else if ((buf[0] & 0xf0) == 0xe0)
+ {
+ /* 3-byte form. */
+ byte = scm_peek_byte_or_eof (port);
+ ASSERT_NOT_EOF (byte);
- c = scm_get_byte_or_eof (port);
+ if (SCM_UNLIKELY ((byte & 0xc0) != 0x80
+ || (buf[0] == 0xe0 && byte < 0xa0)
+ || (buf[0] == 0xed && byte > 0x9f)))
+ goto invalid_seq;
- if (c == EOF)
+ CONSUME_PEEKED_BYTE ();
+ buf[1] = (scm_t_uint8) byte;
+ *len = 2;
+
+ byte = scm_peek_byte_or_eof (port);
+ ASSERT_NOT_EOF (byte);
+
+ if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
+ goto invalid_seq;
+
+ CONSUME_PEEKED_BYTE ();
+ buf[2] = (scm_t_uint8) byte;
+ *len = 3;
+
+ *codepoint = ((scm_t_wchar) buf[0] & 0x0f) << 12UL
+ | ((scm_t_wchar) buf[1] & 0x3f) << 6UL
+ | (buf[2] & 0x3f);
+ }
+ else if (buf[0] >= 0xf0 && buf[0] <= 0xf4)
+ {
+ /* 4-byte form. */
+ byte = scm_peek_byte_or_eof (port);
+ ASSERT_NOT_EOF (byte);
+
+ if (SCM_UNLIKELY (((byte & 0xc0) != 0x80)
+ || (buf[0] == 0xf0 && byte < 0x90)
+ || (buf[0] == 0xf4 && byte > 0x8f)))
+ goto invalid_seq;
+
+ CONSUME_PEEKED_BYTE ();
+ buf[1] = (scm_t_uint8) byte;
+ *len = 2;
+
+ byte = scm_peek_byte_or_eof (port);
+ ASSERT_NOT_EOF (byte);
+
+ if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
+ goto invalid_seq;
+
+ CONSUME_PEEKED_BYTE ();
+ buf[2] = (scm_t_uint8) byte;
+ *len = 3;
+
+ byte = scm_peek_byte_or_eof (port);
+ ASSERT_NOT_EOF (byte);
+
+ if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
+ goto invalid_seq;
+
+ CONSUME_PEEKED_BYTE ();
+ buf[3] = (scm_t_uint8) byte;
+ *len = 4;
+
+ *codepoint = ((scm_t_wchar) buf[0] & 0x07) << 18UL
+ | ((scm_t_wchar) buf[1] & 0x3f) << 12UL
+ | ((scm_t_wchar) buf[2] & 0x3f) << 6UL
+ | (buf[3] & 0x3f);
+ }
+ else
+ goto invalid_seq;
+
+ return 0;
+
+ invalid_seq:
+ /* Here we could choose the consume the faulty byte when it's not a
+ valid starting byte, but it's not a requirement. What Section 3.9
+ of Unicode 6.0.0 mandates, though, is to not consume a byte that
+ would otherwise be a valid starting byte. */
+
+ return EILSEQ;
+
+#undef CONSUME_PEEKED_BYTE
+#undef ASSERT_NOT_EOF
+}
+
+/* Likewise, read a byte sequence from PORT, passing it through its
+ input conversion descriptor. */
+static int
+get_iconv_codepoint (SCM port, scm_t_wchar *codepoint,
+ char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
+{
+ scm_t_port *pt;
+ int err, byte_read;
+ size_t bytes_consumed, output_size;
+ char *output;
+ scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
+
+ pt = SCM_PTAB_ENTRY (port);
+
+ for (output_size = 0, output = (char *) utf8_buf,
+ bytes_consumed = 0, err = 0;
+ err == 0 && output_size == 0
+ && (bytes_consumed == 0 || byte_read != EOF);
+ bytes_consumed++)
+ {
+ char *input;
+ size_t input_left, output_left, done;
+
+ byte_read = scm_get_byte_or_eof (port);
+ if (byte_read == EOF)
{
- /* EOF before a complete character was read. Push it all
- back and return EOF. */
- while (bufcount > 0)
+ if (bytes_consumed == 0)
{
- /* FIXME: this will probably cause errors in the port column. */
- scm_unget_byte (buf[bufcount-1], port);
- bufcount --;
+ *codepoint = (scm_t_wchar) EOF;
+ *len = 0;
+ return 0;
}
- return EOF;
+ else
+ continue;
}
-
- if (c == '\n')
+
+ buf[bytes_consumed] = byte_read;
+
+ input = buf;
+ input_left = bytes_consumed + 1;
+ output_left = sizeof (utf8_buf);
+
+ done = iconv (pt->input_cd, &input, &input_left,
+ &output, &output_left);
+ if (done == (size_t) -1)
{
- /* It is always invalid to have EOL in the middle of a
- multibyte character. */
- scm_unget_byte ('\n', port);
- goto failure;
+ err = errno;
+ if (err == EINVAL)
+ /* Missing input: keep trying. */
+ err = 0;
}
-
- buf[bufcount++] = c;
+ else
+ output_size = sizeof (utf8_buf) - output_left;
}
- success:
- switch (codepoint)
+ if (SCM_UNLIKELY (output_size == 0))
+ /* An unterminated sequence. */
+ err = EILSEQ;
+ else if (SCM_LIKELY (err == 0))
{
- case '\a':
- break;
- case '\b':
- SCM_DECCOL (port);
- break;
- case '\n':
- SCM_INCLINE (port);
- break;
- case '\r':
- SCM_ZEROCOL (port);
- break;
- case '\t':
- SCM_TABCOL (port);
- break;
- default:
- SCM_INCCOL (port);
- break;
+ /* Convert the UTF8_BUF sequence to a Unicode code point. */
+ *codepoint = utf8_to_codepoint (utf8_buf, output_size);
+ *len = bytes_consumed;
}
- return codepoint;
+ return err;
+}
- failure:
- {
- char *err_buf;
- SCM err_str = scm_i_make_string (bufcount, &err_buf);
- memcpy (err_buf, buf, bufcount);
-
- if (errno == EILSEQ)
- scm_misc_error (NULL, "input encoding error for ~s: ~s",
- scm_list_2 (scm_from_locale_string (scm_i_get_port_encoding (port)),
- err_str));
- else
- scm_misc_error (NULL, "input encoding error (invalid) for ~s: ~s\n",
- scm_list_2 (scm_from_locale_string (scm_i_get_port_encoding (port)),
- err_str));
- }
+/* Read a codepoint from PORT and return it in *CODEPOINT. Fill BUF
+ with the byte representation of the codepoint in PORT's encoding, and
+ set *LEN to the length in bytes of that representation. Return 0 on
+ success and an errno value on error. */
+static int
+get_codepoint (SCM port, scm_t_wchar *codepoint,
+ char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
+{
+ int err;
+ scm_t_port *pt = SCM_PTAB_ENTRY (port);
- /* Never gets here. */
- return 0;
+ if (pt->input_cd == (iconv_t) -1)
+ /* Initialize the conversion descriptors, if needed. */
+ scm_i_set_port_encoding_x (port, pt->encoding);
+
+ /* FIXME: In 2.1, add a flag to determine whether a port is UTF-8. */
+ if (pt->input_cd == (iconv_t) -1)
+ err = get_utf8_codepoint (port, codepoint, (scm_t_uint8 *) buf, len);
+ else
+ err = get_iconv_codepoint (port, codepoint, buf, len);
+
+ if (SCM_LIKELY (err == 0))
+ update_port_lf (*codepoint, port);
+ else if (pt->ilseq_handler == SCM_ICONVEH_QUESTION_MARK)
+ {
+ *codepoint = '?';
+ err = 0;
+ update_port_lf (*codepoint, port);
+ }
+
+ return err;
}
+/* Read a codepoint from PORT and return it. */
+scm_t_wchar
+scm_getc (SCM port)
+#define FUNC_NAME "scm_getc"
+{
+ int err;
+ size_t len;
+ scm_t_wchar codepoint;
+ char buf[SCM_MBCHAR_BUF_SIZE];
+
+ err = get_codepoint (port, &codepoint, buf, &len);
+ if (SCM_UNLIKELY (err != 0))
+ /* At this point PORT should point past the invalid encoding, as per
+ R6RS-lib Section 8.2.4. */
+ scm_decoding_error (FUNC_NAME, err, "input decoding error", port);
+
+ return codepoint;
+}
+#undef FUNC_NAME
/* this should only be called when the read buffer is empty. it
tries to refill the read buffer. it returns the first char from
* This function differs from scm_c_write; it updates port line and
* column. */
-static void
-update_port_lf (scm_t_wchar c, SCM port)
-{
- if (c == '\a')
- ; /* Do nothing. */
- else if (c == '\b')
- SCM_DECCOL (port);
- else if (c == '\n')
- SCM_INCLINE (port);
- else if (c == '\r')
- SCM_ZEROCOL (port);
- else if (c == '\t')
- SCM_TABCOL (port);
- else
- SCM_INCCOL (port);
-}
-
void
scm_lfwrite (const char *ptr, size_t size, SCM port)
{
pt->rw_active = SCM_PORT_WRITE;
}
-/* Write a scheme string STR to PORT from START inclusive to END
- exclusive. */
+/* Write STR to PORT from START inclusive to END exclusive. */
void
scm_lfwrite_substr (SCM str, size_t start, size_t end, SCM port)
{
- size_t i, size = scm_i_string_length (str);
scm_t_port *pt = SCM_PTAB_ENTRY (port);
- scm_t_ptob_descriptor *ptob = &scm_ptobs[SCM_PTOBNUM (port)];
- scm_t_wchar p;
- char *buf;
- size_t len;
-
- if (pt->rw_active == SCM_PORT_READ)
- scm_end_input (port);
-
- if (end == (size_t) (-1))
- end = size;
- size = end - start;
-
- /* Note that making a substring will likely take the
- stringbuf_write_mutex. So, one shouldn't use scm_lfwrite_substr
- if the stringbuf write mutex may still be held elsewhere. */
- buf = scm_to_stringn (scm_c_substring (str, start, end), &len,
- pt->encoding, pt->ilseq_handler);
- ptob->write (port, buf, len);
- free (buf);
-
- for (i = 0; i < size; i++)
- {
- p = scm_i_string_ref (str, i + start);
- update_port_lf (p, port);
- }
-
- if (pt->rw_random)
- pt->rw_active = SCM_PORT_WRITE;
-}
-
-/* Write a scheme string STR to PORT. */
-void
-scm_lfwrite_str (SCM str, SCM port)
-{
- size_t i, size = scm_i_string_length (str);
- scm_t_port *pt = SCM_PTAB_ENTRY (port);
- scm_t_ptob_descriptor *ptob = &scm_ptobs[SCM_PTOBNUM (port)];
- scm_t_wchar p;
- char *buf;
- size_t len;
if (pt->rw_active == SCM_PORT_READ)
scm_end_input (port);
- buf = scm_to_stringn (str, &len,
- pt->encoding, pt->ilseq_handler);
- ptob->write (port, buf, len);
- free (buf);
+ if (end == (size_t) -1)
+ end = scm_i_string_length (str);
- for (i = 0; i < size; i++)
- {
- p = scm_i_string_ref (str, i);
- update_port_lf (p, port);
- }
+ scm_display (scm_c_substring (str, start, end), port);
if (pt->rw_random)
pt->rw_active = SCM_PORT_WRITE;
}
#undef FUNC_NAME
-void
+void
scm_ungetc (scm_t_wchar c, SCM port)
#define FUNC_NAME "scm_ungetc"
{
scm_t_port *pt = SCM_PTAB_ENTRY (port);
- scm_t_wchar *wbuf;
- SCM str = scm_i_make_wide_string (1, &wbuf);
- char *buf;
+ char *result;
+ char result_buf[10];
+ const char *encoding;
size_t len;
int i;
- wbuf[0] = c;
- buf = scm_to_stringn (str, &len, pt->encoding, pt->ilseq_handler);
-
+ if (pt->encoding != NULL)
+ encoding = pt->encoding;
+ else
+ encoding = "ISO-8859-1";
+
+ len = sizeof (result_buf);
+ result = u32_conv_to_encoding (encoding,
+ (enum iconv_ilseq_handler) pt->ilseq_handler,
+ (uint32_t *) &c, 1, NULL,
+ result_buf, &len);
+
+ if (SCM_UNLIKELY (result == NULL || len == 0))
+ scm_encoding_error (FUNC_NAME, errno,
+ "conversion to port encoding failed",
+ SCM_BOOL_F, SCM_MAKE_CHAR (c));
+
for (i = len - 1; i >= 0; i--)
- scm_unget_byte (buf[i], port);
+ scm_unget_byte (result[i], port);
+
+ if (SCM_UNLIKELY (result != result_buf))
+ free (result);
if (c == '\n')
{
"return the value returned by the preceding call to\n"
"@code{peek-char}. In particular, a call to @code{peek-char} on\n"
"an interactive port will hang waiting for input whenever a call\n"
- "to @code{read-char} would have hung.")
+ "to @code{read-char} would have hung.\n"
+ "\n"
+ "As for @code{read-char}, a @code{decoding-error} may be raised\n"
+ "if such a situation occurs. However, unlike with @code{read-char},\n"
+ "@var{port} still points at the beginning of the erroneous byte\n"
+ "sequence when the error is raised.\n")
#define FUNC_NAME s_scm_peek_char
{
- scm_t_wchar c, column;
+ int err;
+ SCM result;
+ scm_t_wchar c;
+ char bytes[SCM_MBCHAR_BUF_SIZE];
+ long column, line, i;
+ size_t len;
+
if (SCM_UNBNDP (port))
port = scm_current_input_port ();
+ SCM_VALIDATE_OPINPORT (1, port);
+
+ column = SCM_COL (port);
+ line = SCM_LINUM (port);
+
+ err = get_codepoint (port, &c, bytes, &len);
+
+ for (i = len - 1; i >= 0; i--)
+ scm_unget_byte (bytes[i], port);
+
+ SCM_COL (port) = column;
+ SCM_LINUM (port) = line;
+
+ if (SCM_UNLIKELY (err != 0))
+ {
+ scm_decoding_error (FUNC_NAME, err, "input decoding error", port);
+
+ /* Shouldn't happen since `catch' always aborts to prompt. */
+ result = SCM_BOOL_F;
+ }
+ else if (c == EOF)
+ result = SCM_EOF_VAL;
else
- SCM_VALIDATE_OPINPORT (1, port);
- column = SCM_COL(port);
- c = scm_getc (port);
- if (EOF == c)
- return SCM_EOF_VAL;
- scm_ungetc (c, port);
- SCM_COL(port) = column;
- return SCM_MAKE_CHAR (c);
+ result = SCM_MAKE_CHAR (c);
+
+ return result;
}
#undef FUNC_NAME
SCM_VALIDATE_CHAR (1, cobj);
if (SCM_UNBNDP (port))
port = scm_current_input_port ();
- else
- SCM_VALIDATE_OPINPORT (2, port);
+ SCM_VALIDATE_OPINPORT (2, port);
c = SCM_CHAR (cobj);
SCM_VALIDATE_STRING (1, str);
if (SCM_UNBNDP (port))
port = scm_current_input_port ();
- else
- SCM_VALIDATE_OPINPORT (2, port);
+ SCM_VALIDATE_OPINPORT (2, port);
n = scm_i_string_length (str);
SCM_DEFINE (scm_port_filename, "port-filename", 1, 0, 0,
(SCM port),
- "Return the filename associated with @var{port}. This function returns\n"
- "the strings \"standard input\", \"standard output\" and \"standard error\"\n"
- "when called on the current input, output and error ports respectively.")
+ "Return the filename associated with @var{port}, or @code{#f}\n"
+ "if no filename is associated with the port.")
#define FUNC_NAME s_scm_port_filename
{
port = SCM_COERCE_OUTPORT (port);
static int scm_port_encoding_init = 0;
-/* Return a C string representation of the current encoding. */
+/* Use ENCODING as the default encoding for future ports. */
+void
+scm_i_set_default_port_encoding (const char *encoding)
+{
+ if (!scm_port_encoding_init
+ || !scm_is_fluid (SCM_VARIABLE_REF (default_port_encoding_var)))
+ scm_misc_error (NULL, "tried to set port encoding fluid before it is initialized",
+ SCM_EOL);
+
+ if (encoding == NULL
+ || !strcmp (encoding, "ASCII")
+ || !strcmp (encoding, "ANSI_X3.4-1968")
+ || !strcmp (encoding, "ISO-8859-1"))
+ scm_fluid_set_x (SCM_VARIABLE_REF (default_port_encoding_var), SCM_BOOL_F);
+ else
+ scm_fluid_set_x (SCM_VARIABLE_REF (default_port_encoding_var),
+ scm_from_locale_string (encoding));
+}
+
+/* Return the name of the default encoding for newly created ports; a
+ return value of NULL means "ISO-8859-1". */
const char *
-scm_i_get_port_encoding (SCM port)
+scm_i_default_port_encoding (void)
{
- SCM encoding;
-
- if (scm_is_false (port))
- {
- if (!scm_port_encoding_init)
- return NULL;
- else if (!scm_is_fluid (SCM_VARIABLE_REF (default_port_encoding_var)))
- return NULL;
- else
- {
- encoding = scm_fluid_ref (SCM_VARIABLE_REF (default_port_encoding_var));
- if (!scm_is_string (encoding))
- return NULL;
- else
- return scm_i_string_chars (encoding);
- }
- }
+ if (!scm_port_encoding_init)
+ return NULL;
+ else if (!scm_is_fluid (SCM_VARIABLE_REF (default_port_encoding_var)))
+ return NULL;
else
{
- scm_t_port *pt;
- pt = SCM_PTAB_ENTRY (port);
- if (pt->encoding)
- return pt->encoding;
- else
+ SCM encoding;
+
+ encoding = scm_fluid_ref (SCM_VARIABLE_REF (default_port_encoding_var));
+ if (!scm_is_string (encoding))
return NULL;
+ else
+ return scm_i_string_chars (encoding);
}
}
-/* Returns ENC if it is a recognized encoding. If it isn't, it tries
- to find an alias of ENC that is valid. Otherwise, it returns
- NULL. */
-static const char *
-find_valid_encoding (const char *enc)
-{
- int isvalid = 0;
- const char str[] = " ";
- scm_t_uint32 *u32;
- size_t u32len;
-
- u32 = u32_conv_from_encoding (enc, iconveh_error, str, 1,
- NULL, NULL, &u32len);
- isvalid = (u32 != NULL);
- free (u32);
-
- if (isvalid)
- return enc;
-
- return NULL;
-}
-
void
-scm_i_set_port_encoding_x (SCM port, const char *enc)
+scm_i_set_port_encoding_x (SCM port, const char *encoding)
{
- const char *valid_enc;
scm_t_port *pt;
+ iconv_t new_input_cd, new_output_cd;
- /* Null is shorthand for the native, Latin-1 encoding. */
- if (enc == NULL)
- valid_enc = NULL;
- else
- {
- valid_enc = find_valid_encoding (enc);
- if (valid_enc == NULL)
- {
- SCM err;
- err = scm_from_locale_string (enc);
- scm_misc_error (NULL, "invalid or unknown character encoding ~s",
- scm_list_1 (err));
- }
- }
+ new_input_cd = (iconv_t) -1;
+ new_output_cd = (iconv_t) -1;
- if (scm_is_false (port))
- {
- /* Set the default encoding for future ports. */
- if (!scm_port_encoding_init
- || !scm_is_fluid (SCM_VARIABLE_REF (default_port_encoding_var)))
- scm_misc_error (NULL, "tried to set port encoding fluid before it is initialized",
- SCM_EOL);
+ /* Set the character encoding for this port. */
+ pt = SCM_PTAB_ENTRY (port);
- if (valid_enc == NULL
- || !strcmp (valid_enc, "ASCII")
- || !strcmp (valid_enc, "ANSI_X3.4-1968")
- || !strcmp (valid_enc, "ISO-8859-1"))
- scm_fluid_set_x (SCM_VARIABLE_REF (default_port_encoding_var), SCM_BOOL_F);
- else
- scm_fluid_set_x (SCM_VARIABLE_REF (default_port_encoding_var),
- scm_from_locale_string (valid_enc));
- }
- else
+ if (encoding == NULL)
+ encoding = "ISO-8859-1";
+
+ if (pt->encoding != encoding)
+ pt->encoding = scm_gc_strdup (encoding, "port");
+
+ /* If ENCODING is UTF-8, then no conversion descriptor is opened
+ because we do I/O ourselves. This saves 100+ KiB for each
+ descriptor. */
+ if (strcmp (encoding, "UTF-8"))
{
- /* Set the character encoding for this port. */
- pt = SCM_PTAB_ENTRY (port);
- if (valid_enc == NULL)
- pt->encoding = NULL;
- else
- pt->encoding = scm_gc_strdup (valid_enc, "port");
+ if (SCM_CELL_WORD_0 (port) & SCM_RDNG)
+ {
+ /* Open an input iconv conversion descriptor, from ENCODING
+ to UTF-8. We choose UTF-8, not UTF-32, because iconv
+ implementations can typically convert from anything to
+ UTF-8, but not to UTF-32 (see
+ <http://lists.gnu.org/archive/html/bug-libunistring/2010-09/msg00007.html>). */
+ new_input_cd = iconv_open ("UTF-8", encoding);
+ if (new_input_cd == (iconv_t) -1)
+ goto invalid_encoding;
+ }
+
+ if (SCM_CELL_WORD_0 (port) & SCM_WRTNG)
+ {
+ new_output_cd = iconv_open (encoding, "UTF-8");
+ if (new_output_cd == (iconv_t) -1)
+ {
+ if (new_input_cd != (iconv_t) -1)
+ iconv_close (new_input_cd);
+ goto invalid_encoding;
+ }
+ }
}
+
+ if (pt->input_cd != (iconv_t) -1)
+ iconv_close (pt->input_cd);
+ if (pt->output_cd != (iconv_t) -1)
+ iconv_close (pt->output_cd);
+
+ pt->input_cd = new_input_cd;
+ pt->output_cd = new_output_cd;
+
+ return;
+
+ invalid_encoding:
+ {
+ SCM err;
+ err = scm_from_locale_string (encoding);
+ scm_misc_error ("scm_i_set_port_encoding_x",
+ "invalid or unknown character encoding ~s",
+ scm_list_1 (err));
+ }
}
SCM_DEFINE (scm_port_encoding, "port-encoding", 1, 0, 0,
SCM_VALIDATE_PORT (1, port);
pt = SCM_PTAB_ENTRY (port);
- enc = scm_i_get_port_encoding (port);
+ enc = pt->encoding;
if (enc)
return scm_from_locale_string (pt->encoding);
else
- return scm_from_locale_string ("NONE");
+ return SCM_BOOL_F;
}
#undef FUNC_NAME
#define FUNC_NAME s_scm_set_port_encoding_x
{
char *enc_str;
- const char *valid_enc_str;
SCM_VALIDATE_PORT (1, port);
SCM_VALIDATE_STRING (2, enc);
enc_str = scm_to_locale_string (enc);
- valid_enc_str = find_valid_encoding (enc_str);
- if (valid_enc_str == NULL)
- {
- free (enc_str);
- scm_misc_error (FUNC_NAME, "invalid or unknown character encoding ~s",
- scm_list_1 (enc));
- }
- else
- {
- scm_i_set_port_encoding_x (port, valid_enc_str);
- free (enc_str);
- }
+ scm_i_set_port_encoding_x (port, enc_str);
+ free (enc_str);
+
return SCM_UNSPECIFIED;
}
#undef FUNC_NAME
{
scm_t_port *pt;
pt = SCM_PTAB_ENTRY (port);
- return pt->ilseq_handler;
+ return pt->ilseq_handler;
}
}
if (scm_is_false (port))
{
/* Set the default encoding for future ports. */
- if (!scm_conversion_strategy
+ if (!scm_conversion_strategy_init
|| !scm_is_fluid (SCM_VARIABLE_REF (scm_conversion_strategy)))
scm_misc_error (NULL, "tried to set conversion strategy fluid before it is initialized",
SCM_EOL);
h = scm_i_get_conversion_strategy (port);
if (h == SCM_FAILED_CONVERSION_ERROR)
- return scm_from_locale_symbol ("error");
+ return scm_from_latin1_symbol ("error");
else if (h == SCM_FAILED_CONVERSION_QUESTION_MARK)
- return scm_from_locale_symbol ("substitute");
+ return scm_from_latin1_symbol ("substitute");
else if (h == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE)
- return scm_from_locale_symbol ("escape");
+ return scm_from_latin1_symbol ("escape");
else
abort ();
SCM_VALIDATE_OPPORT (1, port);
}
- err = scm_from_locale_symbol ("error");
+ err = scm_from_latin1_symbol ("error");
if (scm_is_true (scm_eqv_p (sym, err)))
{
scm_i_set_conversion_strategy_x (port, SCM_FAILED_CONVERSION_ERROR);
return SCM_UNSPECIFIED;
}
- qm = scm_from_locale_symbol ("substitute");
+ qm = scm_from_latin1_symbol ("substitute");
if (scm_is_true (scm_eqv_p (sym, qm)))
{
scm_i_set_conversion_strategy_x (port,
return SCM_UNSPECIFIED;
}
- esc = scm_from_locale_symbol ("escape");
+ esc = scm_from_latin1_symbol ("escape");
if (scm_is_true (scm_eqv_p (sym, esc)))
{
scm_i_set_conversion_strategy_x (port,
return 1;
}
-void
-scm_ports_prehistory ()
-{
- scm_numptob = 0;
- scm_ptobs = NULL;
-}
-
\f
/* Void ports. */
static SCM
scm_i_void_port (long mode_bits)
{
- scm_i_scm_pthread_mutex_lock (&scm_i_port_table_mutex);
- {
- SCM answer = scm_new_port_table_entry (scm_tc16_void_port);
- scm_t_port * pt = SCM_PTAB_ENTRY(answer);
+ SCM ret;
- scm_port_non_buffer (pt);
+ ret = scm_c_make_port (scm_tc16_void_port, mode_bits, 0);
+
+ scm_port_non_buffer (SCM_PTAB_ENTRY (ret));
- SCM_SETSTREAM (answer, 0);
- SCM_SET_CELL_TYPE (answer, scm_tc16_void_port | mode_bits);
- scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex);
- return answer;
- }
+ return ret;
}
SCM
scm_tc16_void_port = scm_make_port_type ("void", fill_input_void_port,
write_void_port);
- cur_inport_fluid = scm_permanent_object (scm_make_fluid ());
- cur_outport_fluid = scm_permanent_object (scm_make_fluid ());
- cur_errport_fluid = scm_permanent_object (scm_make_fluid ());
- cur_loadport_fluid = scm_permanent_object (scm_make_fluid ());
+ cur_inport_fluid = scm_make_fluid ();
+ cur_outport_fluid = scm_make_fluid ();
+ cur_errport_fluid = scm_make_fluid ();
+ cur_loadport_fluid = scm_make_fluid ();
- scm_i_port_weak_hash = scm_permanent_object (scm_make_weak_key_hash_table (SCM_I_MAKINUM(31)));
+ scm_i_port_weak_set = scm_c_make_weak_set (31);
#include "libguile/ports.x"