X-Git-Url: https://git.hcoop.net/bpt/guile.git/blobdiff_plain/b07992900bef24596532018d7939384f8632cd0f..60617d819d77a1b92ed6c557a0b49b8e9a8e97b9:/libguile/ports.c diff --git a/libguile/ports.c b/libguile/ports.c index 4ed5f76d7..720ffc1b5 100644 --- a/libguile/ports.c +++ b/libguile/ports.c @@ -1,5 +1,6 @@ -/* Copyright (C) 1995,1996,1997,1998,1999,2000,2001, 2003, 2004, 2006, 2007, 2008, 2009 Free Software Foundation, Inc. - * +/* Copyright (C) 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2003, 2004, 2006, + * 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014 Free Software Foundation, Inc. + * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License * as published by the Free Software Foundation; either version 3 of @@ -29,13 +30,20 @@ #include #include #include /* for chsize on mingw */ +#include +#include +#include +#include +#include +#include + #include #include "libguile/_scm.h" #include "libguile/async.h" +#include "libguile/deprecation.h" #include "libguile/eval.h" #include "libguile/fports.h" /* direct access for seek and truncate */ -#include "libguile/objects.h" #include "libguile/goops.h" #include "libguile/smob.h" #include "libguile/chars.h" @@ -48,18 +56,17 @@ #include "libguile/mallocs.h" #include "libguile/validate.h" #include "libguile/ports.h" +#include "libguile/ports-internal.h" #include "libguile/vectors.h" #include "libguile/weaks.h" #include "libguile/fluids.h" +#include "libguile/eq.h" +#include "libguile/alist.h" #ifdef HAVE_STRING_H #include #endif -#ifdef HAVE_MALLOC_H -#include -#endif - #ifdef HAVE_IO_H #include #endif @@ -79,7 +86,7 @@ gnulib ftruncate.c has code using fcntl F_CHSIZE and F_FREESP, which might be possibilities if we've got other systems without ftruncate. */ -#if HAVE_CHSIZE && ! HAVE_FTRUNCATE +#if defined HAVE_CHSIZE && ! defined HAVE_FTRUNCATE #define ftruncate(fd, size) chsize (fd, size) #undef HAVE_FTRUNCATE #define HAVE_FTRUNCATE 1 @@ -94,8 +101,8 @@ * Indexes into this table are used when generating type * tags for smobjects (if you know a tag you can get an index and conversely). */ -scm_t_ptob_descriptor *scm_ptobs; -long scm_numptob; +scm_t_ptob_descriptor *scm_ptobs = NULL; +long scm_numptob = 0; /* GC marker for a port with stream of SCM type. */ SCM @@ -125,12 +132,6 @@ end_input_default (SCM port SCM_UNUSED, int offset SCM_UNUSED) { } -static size_t -scm_port_free0 (SCM port) -{ - return 0; -} - scm_t_bits scm_make_port_type (char *name, int (*fill_input) (SCM port), @@ -140,16 +141,18 @@ scm_make_port_type (char *name, if (SCM_I_MAX_PORT_TYPE_COUNT - 1 <= scm_numptob) goto ptoberr; SCM_CRITICAL_SECTION_START; - SCM_SYSCALL (tmp = (char *) realloc ((char *) scm_ptobs, - (1 + scm_numptob) - * sizeof (scm_t_ptob_descriptor))); + tmp = (char *) scm_gc_realloc ((char *) scm_ptobs, + scm_numptob * sizeof (scm_t_ptob_descriptor), + (1 + scm_numptob) + * sizeof (scm_t_ptob_descriptor), + "port-type"); if (tmp) { scm_ptobs = (scm_t_ptob_descriptor *) tmp; scm_ptobs[scm_numptob].name = name; scm_ptobs[scm_numptob].mark = 0; - scm_ptobs[scm_numptob].free = scm_port_free0; + scm_ptobs[scm_numptob].free = NULL; scm_ptobs[scm_numptob].print = scm_port_print; scm_ptobs[scm_numptob].equalp = 0; scm_ptobs[scm_numptob].close = 0; @@ -240,6 +243,42 @@ scm_set_port_input_waiting (scm_t_bits tc, int (*input_waiting) (SCM)) scm_ptobs[SCM_TC2PTOBNUM (tc)].input_waiting = input_waiting; } +static void +scm_i_set_pending_eof (SCM port) +{ + SCM_PORT_GET_INTERNAL (port)->pending_eof = 1; +} + +static void +scm_i_clear_pending_eof (SCM port) +{ + SCM_PORT_GET_INTERNAL (port)->pending_eof = 0; +} + +SCM_DEFINE (scm_i_port_property, "%port-property", 2, 0, 0, + (SCM port, SCM key), + "Return the property of @var{port} associated with @var{key}.") +#define FUNC_NAME s_scm_i_port_property +{ + SCM_VALIDATE_OPPORT (1, port); + return scm_assq_ref (SCM_PORT_GET_INTERNAL (port)->alist, key); +} +#undef FUNC_NAME + +SCM_DEFINE (scm_i_set_port_property_x, "%set-port-property!", 3, 0, 0, + (SCM port, SCM key, SCM value), + "Set the property of @var{port} associated with @var{key} to @var{value}.") +#define FUNC_NAME s_scm_i_set_port_property_x +{ + scm_t_port_internal *pti; + + SCM_VALIDATE_OPPORT (1, port); + pti = SCM_PORT_GET_INTERNAL (port); + pti->alist = scm_assq_set_x (pti->alist, key, value); + return SCM_UNSPECIFIED; +} +#undef FUNC_NAME + SCM_DEFINE (scm_char_ready_p, "char-ready?", 0, 1, 0, @@ -264,8 +303,9 @@ SCM_DEFINE (scm_char_ready_p, "char-ready?", 0, 1, 0, if (SCM_UNBNDP (port)) port = scm_current_input_port (); - else - SCM_VALIDATE_OPINPORT (1, port); + /* It's possible to close the current input port, so validate even in + this case. */ + SCM_VALIDATE_OPINPORT (1, port); pt = SCM_PTAB_ENTRY (port); @@ -288,19 +328,21 @@ SCM_DEFINE (scm_char_ready_p, "char-ready?", 0, 1, 0, } #undef FUNC_NAME -/* move up to read_len chars from port's putback and/or read buffers - into memory starting at dest. returns the number of chars moved. */ -size_t scm_take_from_input_buffers (SCM port, char *dest, size_t read_len) +/* Move up to READ_LEN bytes from PORT's putback and/or read buffers + into memory starting at DEST. Return the number of bytes moved. + PORT's line/column numbers are left unchanged. */ +size_t +scm_take_from_input_buffers (SCM port, char *dest, size_t read_len) { scm_t_port *pt = SCM_PTAB_ENTRY (port); - size_t chars_read = 0; + size_t bytes_read = 0; size_t from_buf = min (pt->read_end - pt->read_pos, read_len); if (from_buf > 0) { memcpy (dest, pt->read_pos, from_buf); pt->read_pos += from_buf; - chars_read += from_buf; + bytes_read += from_buf; read_len -= from_buf; dest += from_buf; } @@ -313,10 +355,11 @@ size_t scm_take_from_input_buffers (SCM port, char *dest, size_t read_len) { memcpy (dest, pt->saved_read_pos, from_buf); pt->saved_read_pos += from_buf; - chars_read += from_buf; + bytes_read += from_buf; } } - return chars_read; + + return bytes_read; } /* Clear a port's read buffers, returning the contents. */ @@ -349,8 +392,14 @@ SCM_DEFINE (scm_drain_input, "drain-input", 1, 0, 0, if (pt->read_buf == pt->putback_buf) count += pt->saved_read_end - pt->saved_read_pos; - result = scm_i_make_string (count, &data); - scm_take_from_input_buffers (port, data, count); + if (count) + { + result = scm_i_make_string (count, &data, 0); + scm_take_from_input_buffers (port, data, count); + } + else + result = scm_nullstr; + return result; } #undef FUNC_NAME @@ -358,10 +407,10 @@ SCM_DEFINE (scm_drain_input, "drain-input", 1, 0, 0, /* Standard ports --- current input, output, error, and more(!). */ -static SCM cur_inport_fluid; -static SCM cur_outport_fluid; -static SCM cur_errport_fluid; -static SCM cur_loadport_fluid; +static SCM cur_inport_fluid = SCM_BOOL_F; +static SCM cur_outport_fluid = SCM_BOOL_F; +static SCM cur_errport_fluid = SCM_BOOL_F; +static SCM cur_loadport_fluid = SCM_BOOL_F; SCM_DEFINE (scm_current_input_port, "current-input-port", 0, 0, 0, (), @@ -370,7 +419,10 @@ SCM_DEFINE (scm_current_input_port, "current-input-port", 0, 0, 0, "returns the @dfn{standard input} in Unix and C terminology.") #define FUNC_NAME s_scm_current_input_port { - return scm_fluid_ref (cur_inport_fluid); + if (scm_is_true (cur_inport_fluid)) + return scm_fluid_ref (cur_inport_fluid); + else + return SCM_BOOL_F; } #undef FUNC_NAME @@ -382,7 +434,10 @@ SCM_DEFINE (scm_current_output_port, "current-output-port", 0, 0, 0, "Unix and C terminology.") #define FUNC_NAME s_scm_current_output_port { - return scm_fluid_ref (cur_outport_fluid); + if (scm_is_true (cur_outport_fluid)) + return scm_fluid_ref (cur_outport_fluid); + else + return SCM_BOOL_F; } #undef FUNC_NAME @@ -392,10 +447,31 @@ SCM_DEFINE (scm_current_error_port, "current-error-port", 0, 0, 0, "@dfn{standard error} in Unix and C terminology).") #define FUNC_NAME s_scm_current_error_port { - return scm_fluid_ref (cur_errport_fluid); + if (scm_is_true (cur_errport_fluid)) + return scm_fluid_ref (cur_errport_fluid); + else + return SCM_BOOL_F; } #undef FUNC_NAME +static SCM current_warning_port_var; +static scm_i_pthread_once_t current_warning_port_once = SCM_I_PTHREAD_ONCE_INIT; + +static void +init_current_warning_port_var (void) +{ + current_warning_port_var + = scm_c_private_variable ("guile", "current-warning-port"); +} + +SCM +scm_current_warning_port (void) +{ + scm_i_pthread_once (¤t_warning_port_once, + init_current_warning_port_var); + return scm_call_0 (scm_variable_ref (current_warning_port_var)); +} + SCM_DEFINE (scm_current_load_port, "current-load-port", 0, 0, 0, (), "Return the current-load-port.\n" @@ -450,6 +526,16 @@ SCM_DEFINE (scm_set_current_error_port, "set-current-error-port", 1, 0, 0, } #undef FUNC_NAME + +SCM +scm_set_current_warning_port (SCM port) +{ + scm_i_pthread_once (¤t_warning_port_once, + init_current_warning_port_var); + return scm_call_1 (scm_variable_ref (current_warning_port_var), port); +} + + void scm_dynwind_current_input_port (SCM port) #define FUNC_NAME NULL @@ -496,8 +582,60 @@ SCM scm_i_port_weak_hash; scm_i_pthread_mutex_t scm_i_port_table_mutex = SCM_I_PTHREAD_MUTEX_INITIALIZER; -/* This function is not and should not be thread safe. */ + +/* Port finalization. */ + + +static void finalize_port (void *, void *); + +/* Register a finalizer for PORT. */ +static SCM_C_INLINE_KEYWORD void +register_finalizer_for_port (SCM port) +{ + /* Register a finalizer for PORT so that its + type's `free' function gets called. */ + scm_i_set_finalizer (SCM2PTR (port), finalize_port, NULL); +} + +/* Finalize the object (a port) pointed to by PTR. */ +static void +finalize_port (void *ptr, void *data) +{ + long port_type; + SCM port = PTR2SCM (ptr); + + if (!SCM_PORTP (port)) + abort (); + + if (SCM_OPENP (port)) + { + if (SCM_REVEALED (port) > 0) + /* Keep "revealed" ports alive and re-register a finalizer. */ + register_finalizer_for_port (port); + else + { + port_type = SCM_TC2PTOBNUM (SCM_CELL_TYPE (port)); + if (port_type >= scm_numptob) + abort (); + + if (scm_ptobs[port_type].free) + /* Yes, I really do mean `.free' rather than `.close'. `.close' + is for explicit `close-port' by user. */ + scm_ptobs[port_type].free (port); + + SCM_SETSTREAM (port, 0); + SCM_CLR_PORT_OPEN_FLAG (port); + + scm_gc_ports_collected++; + } + } +} + + + + +/* This function is not and should not be thread safe. */ SCM scm_new_port_table_entry (scm_t_bits tag) #define FUNC_NAME "scm_new_port_table_entry" @@ -508,32 +646,74 @@ scm_new_port_table_entry (scm_t_bits tag) */ SCM z = scm_cons (SCM_EOL, SCM_EOL); - scm_t_port *entry = (scm_t_port *) scm_gc_calloc (sizeof (scm_t_port), "port"); + scm_t_port *entry = scm_gc_typed_calloc (scm_t_port); + scm_t_port_internal *pti = scm_gc_typed_calloc (scm_t_port_internal); + const char *encoding; entry->file_name = SCM_BOOL_F; entry->rw_active = SCM_PORT_NEITHER; entry->port = z; + /* Initialize this port with the thread's current default + encoding. */ + encoding = scm_i_default_port_encoding (); + entry->ilseq_handler = scm_i_default_port_conversion_handler (); + entry->encoding = encoding ? scm_gc_strdup (encoding, "port") : NULL; + if (encoding && c_strcasecmp (encoding, "UTF-8") == 0) + pti->encoding_mode = SCM_PORT_ENCODING_MODE_UTF8; + else + pti->encoding_mode = SCM_PORT_ENCODING_MODE_ICONV; + pti->iconv_descriptors = NULL; + + pti->at_stream_start_for_bom_read = 1; + pti->at_stream_start_for_bom_write = 1; + + /* XXX These fields are not what they seem. They have been + repurposed, but cannot safely be renamed in 2.0 without breaking + ABI compatibility. This will be cleaned up in 2.2. */ + entry->input_cd = pti; /* XXX pointer to the internal port structure */ + entry->output_cd = NULL; /* XXX unused */ + + pti->pending_eof = 0; + pti->alist = SCM_EOL; + + /* Until Guile 2.0.9 included, 'setvbuf' would only work on file + ports. Now all port types can be supported, but it's not clear + that port types out in wild accept having someone else fiddle with + their buffer. Thus, conservatively turn it off by default. */ + pti->setvbuf = NULL; + SCM_SET_CELL_TYPE (z, tag); SCM_SETPTAB_ENTRY (z, entry); scm_hashq_set_x (scm_i_port_weak_hash, z, SCM_BOOL_F); + /* For each new port, register a finalizer so that it port type's free + function can be invoked eventually. */ + register_finalizer_for_port (z); + return z; } #undef FUNC_NAME #if SCM_ENABLE_DEPRECATED==1 -SCM_API scm_t_port * +scm_t_port * scm_add_to_port_table (SCM port) { - SCM z = scm_new_port_table_entry (scm_tc7_port); - scm_t_port * pt = SCM_PTAB_ENTRY(z); + SCM z; + scm_t_port * pt; + + scm_c_issue_deprecation_warning ("scm_add_to_port_table is deprecated."); + scm_i_pthread_mutex_lock (&scm_i_port_table_mutex); + z = scm_new_port_table_entry (scm_tc7_port); + pt = SCM_PTAB_ENTRY(z); pt->port = port; SCM_SETCAR (z, SCM_EOL); SCM_SETCDR (z, SCM_EOL); SCM_SETPTAB_ENTRY (port, pt); + scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex); + return pt; } #endif @@ -541,18 +721,34 @@ scm_add_to_port_table (SCM port) /* Remove a port from the table and destroy it. */ -/* This function is not and should not be thread safe. */ -void +static void close_iconv_descriptors (scm_t_iconv_descriptors *id); + +static void scm_i_remove_port (SCM port) #define FUNC_NAME "scm_remove_port" { - scm_t_port *p = SCM_PTAB_ENTRY (port); - if (p->putback_buf) - scm_gc_free (p->putback_buf, p->putback_buf_size, "putback buffer"); - scm_gc_free (p, sizeof (scm_t_port), "port"); + scm_t_port *p; + scm_t_port_internal *pti; + + scm_i_scm_pthread_mutex_lock (&scm_i_port_table_mutex); + + p = SCM_PTAB_ENTRY (port); + pti = SCM_PORT_GET_INTERNAL (port); + scm_port_non_buffer (p); + p->putback_buf = NULL; + p->putback_buf_size = 0; + + if (pti->iconv_descriptors) + { + close_iconv_descriptors (pti->iconv_descriptors); + pti->iconv_descriptors = NULL; + } SCM_SETPTAB_ENTRY (port, 0); + scm_hashq_remove_x (scm_i_port_weak_hash, port); + + scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex); } #undef FUNC_NAME @@ -632,21 +828,22 @@ SCM_DEFINE (scm_set_port_revealed_x, "set-port-revealed!", 2, 0, 0, */ static long -scm_i_mode_bits_n (const char *modes, size_t n) +scm_i_mode_bits_n (SCM modes) { return (SCM_OPN - | (memchr (modes, 'r', n) || memchr (modes, '+', n) ? SCM_RDNG : 0) - | ( memchr (modes, 'w', n) - || memchr (modes, 'a', n) - || memchr (modes, '+', n) ? SCM_WRTNG : 0) - | (memchr (modes, '0', n) ? SCM_BUF0 : 0) - | (memchr (modes, 'l', n) ? SCM_BUFLINE : 0)); + | (scm_i_string_contains_char (modes, 'r') + || scm_i_string_contains_char (modes, '+') ? SCM_RDNG : 0) + | (scm_i_string_contains_char (modes, 'w') + || scm_i_string_contains_char (modes, 'a') + || scm_i_string_contains_char (modes, '+') ? SCM_WRTNG : 0) + | (scm_i_string_contains_char (modes, '0') ? SCM_BUF0 : 0) + | (scm_i_string_contains_char (modes, 'l') ? SCM_BUFLINE : 0)); } long scm_mode_bits (char *modes) { - return scm_i_mode_bits_n (modes, strlen (modes)); + return scm_i_mode_bits (scm_from_locale_string (modes)); } long @@ -657,8 +854,7 @@ scm_i_mode_bits (SCM modes) if (!scm_is_string (modes)) scm_wrong_type_arg_msg (NULL, 0, modes, "string"); - bits = scm_i_mode_bits_n (scm_i_string_chars (modes), - scm_i_string_length (modes)); + bits = scm_i_mode_bits_n (modes); scm_remember_upto_here_1 (modes); return bits; } @@ -725,9 +921,7 @@ SCM_DEFINE (scm_close_port, "close-port", 1, 0, 0, rv = (scm_ptobs[i].close) (port); else rv = 0; - scm_i_scm_pthread_mutex_lock (&scm_i_port_table_mutex); scm_i_remove_port (port); - scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex); SCM_CLR_PORT_OPEN_FLAG (port); return scm_from_bool (rv >= 0); } @@ -765,44 +959,29 @@ SCM_DEFINE (scm_close_output_port, "close-output-port", 1, 0, 0, #undef FUNC_NAME static SCM -scm_i_collect_keys_in_vector (void *closure, SCM key, SCM value, SCM result) +collect_keys (void *unused, SCM key, SCM value, SCM result) { - int *i = (int*) closure; - scm_c_vector_set_x (result, *i, key); - (*i)++; - - return result; + return scm_cons (key, result); } void scm_c_port_for_each (void (*proc)(void *data, SCM p), void *data) { - int i = 0; - size_t n; SCM ports; - /* Even without pre-emptive multithreading, running arbitrary code - while scanning the port table is unsafe because the port table - can change arbitrarily (from a GC, for example). So we first - collect the ports into a vector. -mvo */ - - scm_i_scm_pthread_mutex_lock (&scm_i_port_table_mutex); - n = SCM_HASHTABLE_N_ITEMS (scm_i_port_weak_hash); - scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex); - ports = scm_c_make_vector (n, SCM_BOOL_F); - + /* Copy out the port table as a list so that we get strong references + to all the values. */ scm_i_pthread_mutex_lock (&scm_i_port_table_mutex); - ports = scm_internal_hash_fold (scm_i_collect_keys_in_vector, &i, - ports, scm_i_port_weak_hash); + ports = scm_internal_hash_fold (collect_keys, NULL, + SCM_EOL, scm_i_port_weak_hash); scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex); - for (i = 0; i < n; i++) { - SCM p = SCM_SIMPLE_VECTOR_REF (ports, i); - if (SCM_PORTP (p)) - proc (data, p); - } - - scm_remember_upto_here_1 (ports); + for (; scm_is_pair (ports); ports = scm_cdr (ports)) + { + SCM p = scm_car (ports); + if (SCM_PORTP (p)) + proc (data, p); + } } SCM_DEFINE (scm_port_for_each, "port-for-each", 1, 0, 0, @@ -810,14 +989,26 @@ SCM_DEFINE (scm_port_for_each, "port-for-each", 1, 0, 0, "Apply @var{proc} to each port in the Guile port table\n" "in turn. The return value is unspecified. More specifically,\n" "@var{proc} is applied exactly once to every port that exists\n" - "in the system at the time @var{port-for-each} is invoked.\n" - "Changes to the port table while @var{port-for-each} is running\n" - "have no effect as far as @var{port-for-each} is concerned.") + "in the system at the time @code{port-for-each} is invoked.\n" + "Changes to the port table while @code{port-for-each} is running\n" + "have no effect as far as @code{port-for-each} is concerned.") #define FUNC_NAME s_scm_port_for_each { + SCM ports; + SCM_VALIDATE_PROC (1, proc); - scm_c_port_for_each ((void (*)(void*,SCM))scm_call_1, proc); + /* Copy out the port table as a list so that we get strong references + to all the values. */ + scm_i_pthread_mutex_lock (&scm_i_port_table_mutex); + ports = scm_internal_hash_fold (collect_keys, NULL, + SCM_EOL, scm_i_port_weak_hash); + scm_i_pthread_mutex_unlock (&scm_i_port_table_mutex); + + for (; scm_is_pair (ports); ports = scm_cdr (ports)) + if (SCM_PORTP (SCM_CAR (ports))) + scm_call_1 (proc, SCM_CAR (ports)); + return SCM_UNSPECIFIED; } #undef FUNC_NAME @@ -926,10 +1117,14 @@ SCM_DEFINE (scm_read_char, "read-char", 0, 1, 0, (SCM port), "Return the next character available from @var{port}, updating\n" "@var{port} to point to the following character. If no more\n" - "characters are available, the end-of-file object is returned.") + "characters are available, the end-of-file object is returned.\n" + "\n" + "When @var{port}'s data cannot be decoded according to its\n" + "character encoding, a @code{decoding-error} is raised and\n" + "@var{port} points past the erroneous byte sequence.\n") #define FUNC_NAME s_scm_read_char { - int c; + scm_t_wchar c; if (SCM_UNBNDP (port)) port = scm_current_input_port (); SCM_VALIDATE_OPINPORT (1, port); @@ -940,160 +1135,526 @@ SCM_DEFINE (scm_read_char, "read-char", 0, 1, 0, } #undef FUNC_NAME -/* this should only be called when the read buffer is empty. it - tries to refill the read buffer. it returns the first char from - the port, which is either EOF or *(pt->read_pos). */ -int -scm_fill_input (SCM port) +/* Update the line and column number of PORT after consumption of C. */ +static inline void +update_port_lf (scm_t_wchar c, SCM port) { - scm_t_port *pt = SCM_PTAB_ENTRY (port); - - assert (pt->read_pos == pt->read_end); - - if (pt->read_buf == pt->putback_buf) + switch (c) { - /* finished reading put-back chars. */ - pt->read_buf = pt->saved_read_buf; - pt->read_pos = pt->saved_read_pos; - pt->read_end = pt->saved_read_end; - pt->read_buf_size = pt->saved_read_buf_size; - if (pt->read_pos < pt->read_end) - return *(pt->read_pos); + case '\a': + case EOF: + break; + case '\b': + SCM_DECCOL (port); + break; + case '\n': + SCM_INCLINE (port); + break; + case '\r': + SCM_ZEROCOL (port); + break; + case '\t': + SCM_TABCOL (port); + break; + default: + SCM_INCCOL (port); + break; } - return scm_ptobs[SCM_PTOBNUM (port)].fill_input (port); } +#define SCM_MBCHAR_BUF_SIZE (4) -/* scm_lfwrite - * - * This function differs from scm_c_write; it updates port line and - * column. */ - -static void -update_port_lf (scm_t_wchar c, SCM port) +/* Convert the SIZE-byte UTF-8 sequence in UTF8_BUF to a codepoint. + UTF8_BUF is assumed to contain a valid UTF-8 sequence. */ +static scm_t_wchar +utf8_to_codepoint (const scm_t_uint8 *utf8_buf, size_t size) { - if (c == '\a') - ; /* Do nothing. */ - else if (c == '\b') - SCM_DECCOL (port); - else if (c == '\n') - SCM_INCLINE (port); - else if (c == '\r') - SCM_ZEROCOL (port); - else if (c == '\t') - SCM_TABCOL (port); + scm_t_wchar codepoint; + + if (utf8_buf[0] <= 0x7f) + { + assert (size == 1); + codepoint = utf8_buf[0]; + } + else if ((utf8_buf[0] & 0xe0) == 0xc0) + { + assert (size == 2); + codepoint = ((scm_t_wchar) utf8_buf[0] & 0x1f) << 6UL + | (utf8_buf[1] & 0x3f); + } + else if ((utf8_buf[0] & 0xf0) == 0xe0) + { + assert (size == 3); + codepoint = ((scm_t_wchar) utf8_buf[0] & 0x0f) << 12UL + | ((scm_t_wchar) utf8_buf[1] & 0x3f) << 6UL + | (utf8_buf[2] & 0x3f); + } else - SCM_INCCOL (port); + { + assert (size == 4); + codepoint = ((scm_t_wchar) utf8_buf[0] & 0x07) << 18UL + | ((scm_t_wchar) utf8_buf[1] & 0x3f) << 12UL + | ((scm_t_wchar) utf8_buf[2] & 0x3f) << 6UL + | (utf8_buf[3] & 0x3f); + } + + return codepoint; } -void -scm_lfwrite (const char *ptr, size_t size, SCM port) +/* Read a UTF-8 sequence from PORT. On success, return 0 and set + *CODEPOINT to the codepoint that was read, fill BUF with its UTF-8 + representation, and set *LEN to the length in bytes. Return + `EILSEQ' on error. */ +static int +get_utf8_codepoint (SCM port, scm_t_wchar *codepoint, + scm_t_uint8 buf[SCM_MBCHAR_BUF_SIZE], size_t *len) { - scm_t_port *pt = SCM_PTAB_ENTRY (port); - scm_t_ptob_descriptor *ptob = &scm_ptobs[SCM_PTOBNUM (port)]; - - if (pt->rw_active == SCM_PORT_READ) - scm_end_input (port); +#define ASSERT_NOT_EOF(b) \ + if (SCM_UNLIKELY ((b) == EOF)) \ + goto invalid_seq +#define CONSUME_PEEKED_BYTE() \ + pt->read_pos++ - ptob->write (port, ptr, size); + int byte; + scm_t_port *pt; - for (; size; ptr++, size--) - update_port_lf ((scm_t_wchar) (unsigned char) *ptr, port); + *len = 0; + pt = SCM_PTAB_ENTRY (port); - if (pt->rw_random) - pt->rw_active = SCM_PORT_WRITE; -} + byte = scm_get_byte_or_eof (port); + if (byte == EOF) + { + *codepoint = EOF; + return 0; + } -/* Write a scheme string STR to PORT from START inclusive to END - exclusive. */ -void -scm_lfwrite_substr (SCM str, size_t start, size_t end, SCM port) -{ - size_t i, size = scm_i_string_length (str); - scm_t_port *pt = SCM_PTAB_ENTRY (port); - scm_t_ptob_descriptor *ptob = &scm_ptobs[SCM_PTOBNUM (port)]; - scm_t_wchar p; - char *buf; - size_t len; + buf[0] = (scm_t_uint8) byte; + *len = 1; - if (pt->rw_active == SCM_PORT_READ) - scm_end_input (port); + if (buf[0] <= 0x7f) + /* 1-byte form. */ + *codepoint = buf[0]; + else if (buf[0] >= 0xc2 && buf[0] <= 0xdf) + { + /* 2-byte form. */ + byte = scm_peek_byte_or_eof (port); + ASSERT_NOT_EOF (byte); - if (end == -1) - end = size; - size = end - start; + if (SCM_UNLIKELY ((byte & 0xc0) != 0x80)) + goto invalid_seq; - buf = scm_to_stringn (scm_c_substring (str, start, end), &len, - NULL, iconveh_escape_sequence); - ptob->write (port, buf, len); - free (buf); + CONSUME_PEEKED_BYTE (); + buf[1] = (scm_t_uint8) byte; + *len = 2; - for (i = 0; i < size; i++) + *codepoint = ((scm_t_wchar) buf[0] & 0x1f) << 6UL + | (buf[1] & 0x3f); + } + else if ((buf[0] & 0xf0) == 0xe0) { - p = scm_i_string_ref (str, i + start); - update_port_lf (p, port); + /* 3-byte form. */ + byte = scm_peek_byte_or_eof (port); + ASSERT_NOT_EOF (byte); + + if (SCM_UNLIKELY ((byte & 0xc0) != 0x80 + || (buf[0] == 0xe0 && byte < 0xa0) + || (buf[0] == 0xed && byte > 0x9f))) + goto invalid_seq; + + CONSUME_PEEKED_BYTE (); + buf[1] = (scm_t_uint8) byte; + *len = 2; + + byte = scm_peek_byte_or_eof (port); + ASSERT_NOT_EOF (byte); + + if (SCM_UNLIKELY ((byte & 0xc0) != 0x80)) + goto invalid_seq; + + CONSUME_PEEKED_BYTE (); + buf[2] = (scm_t_uint8) byte; + *len = 3; + + *codepoint = ((scm_t_wchar) buf[0] & 0x0f) << 12UL + | ((scm_t_wchar) buf[1] & 0x3f) << 6UL + | (buf[2] & 0x3f); } + else if (buf[0] >= 0xf0 && buf[0] <= 0xf4) + { + /* 4-byte form. */ + byte = scm_peek_byte_or_eof (port); + ASSERT_NOT_EOF (byte); - if (pt->rw_random) - pt->rw_active = SCM_PORT_WRITE; -} + if (SCM_UNLIKELY (((byte & 0xc0) != 0x80) + || (buf[0] == 0xf0 && byte < 0x90) + || (buf[0] == 0xf4 && byte > 0x8f))) + goto invalid_seq; -/* Write a scheme string STR to PORT. */ -void -scm_lfwrite_str (SCM str, SCM port) -{ - scm_lfwrite_substr (str, 0, -1, port); -} + CONSUME_PEEKED_BYTE (); + buf[1] = (scm_t_uint8) byte; + *len = 2; -/* scm_c_read - * - * Used by an application to read arbitrary number of bytes from an - * SCM port. Same semantics as libc read, except that scm_c_read only - * returns less than SIZE bytes if at end-of-file. - * - * Warning: Doesn't update port line and column counts! */ + byte = scm_peek_byte_or_eof (port); + ASSERT_NOT_EOF (byte); -/* This structure, and the following swap_buffer function, are used - for temporarily swapping a port's own read buffer, and the buffer - that the caller of scm_c_read provides. */ -struct port_and_swap_buffer -{ - scm_t_port *pt; - unsigned char *buffer; - size_t size; -}; + if (SCM_UNLIKELY ((byte & 0xc0) != 0x80)) + goto invalid_seq; -static void -swap_buffer (void *data) -{ - struct port_and_swap_buffer *psb = (struct port_and_swap_buffer *) data; - unsigned char *old_buf = psb->pt->read_buf; - size_t old_size = psb->pt->read_buf_size; + CONSUME_PEEKED_BYTE (); + buf[2] = (scm_t_uint8) byte; + *len = 3; - /* Make the port use (buffer, size) from the struct. */ - psb->pt->read_pos = psb->pt->read_buf = psb->pt->read_end = psb->buffer; - psb->pt->read_buf_size = psb->size; + byte = scm_peek_byte_or_eof (port); + ASSERT_NOT_EOF (byte); - /* Save the port's old (buffer, size) in the struct. */ - psb->buffer = old_buf; - psb->size = old_size; + if (SCM_UNLIKELY ((byte & 0xc0) != 0x80)) + goto invalid_seq; + + CONSUME_PEEKED_BYTE (); + buf[3] = (scm_t_uint8) byte; + *len = 4; + + *codepoint = ((scm_t_wchar) buf[0] & 0x07) << 18UL + | ((scm_t_wchar) buf[1] & 0x3f) << 12UL + | ((scm_t_wchar) buf[2] & 0x3f) << 6UL + | (buf[3] & 0x3f); + } + else + goto invalid_seq; + + return 0; + + invalid_seq: + /* Here we could choose the consume the faulty byte when it's not a + valid starting byte, but it's not a requirement. What Section 3.9 + of Unicode 6.0.0 mandates, though, is to not consume a byte that + would otherwise be a valid starting byte. */ + + return EILSEQ; + +#undef CONSUME_PEEKED_BYTE +#undef ASSERT_NOT_EOF } -size_t -scm_c_read (SCM port, void *buffer, size_t size) -#define FUNC_NAME "scm_c_read" +/* Likewise, read a byte sequence from PORT, passing it through its + input conversion descriptor. */ +static int +get_iconv_codepoint (SCM port, scm_t_wchar *codepoint, + char buf[SCM_MBCHAR_BUF_SIZE], size_t *len) { - scm_t_port *pt; - size_t n_read = 0, n_available; - struct port_and_swap_buffer psb; + scm_t_iconv_descriptors *id; + scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE]; + size_t input_size = 0; - SCM_VALIDATE_OPINPORT (1, port); + id = scm_i_port_iconv_descriptors (port, SCM_PORT_READ); - pt = SCM_PTAB_ENTRY (port); - if (pt->rw_active == SCM_PORT_WRITE) - scm_ptobs[SCM_PTOBNUM (port)].flush (port); + for (;;) + { + int byte_read; + char *input, *output; + size_t input_left, output_left, done; - if (pt->rw_random) + byte_read = scm_get_byte_or_eof (port); + if (SCM_UNLIKELY (byte_read == EOF)) + { + if (SCM_LIKELY (input_size == 0)) + { + *codepoint = (scm_t_wchar) EOF; + *len = input_size; + return 0; + } + else + { + /* EOF found in the middle of a multibyte character. */ + scm_i_set_pending_eof (port); + return EILSEQ; + } + } + + buf[input_size++] = byte_read; + + input = buf; + input_left = input_size; + output = (char *) utf8_buf; + output_left = sizeof (utf8_buf); + + done = iconv (id->input_cd, &input, &input_left, &output, &output_left); + + if (done == (size_t) -1) + { + int err = errno; + if (SCM_LIKELY (err == EINVAL)) + /* The input byte sequence did not form a complete + character. Read another byte and try again. */ + continue; + else + return err; + } + else + { + size_t output_size = sizeof (utf8_buf) - output_left; + if (SCM_LIKELY (output_size > 0)) + { + /* iconv generated output. Convert the UTF8_BUF sequence + to a Unicode code point. */ + *codepoint = utf8_to_codepoint (utf8_buf, output_size); + *len = input_size; + return 0; + } + else + { + /* iconv consumed some bytes without producing any output. + Most likely this means that a Unicode byte-order mark + (BOM) was consumed, which should not be included in the + returned buf. Shift any remaining bytes to the beginning + of buf, and continue the loop. */ + memmove (buf, input, input_left); + input_size = input_left; + continue; + } + } + } +} + +/* Read a codepoint from PORT and return it in *CODEPOINT. Fill BUF + with the byte representation of the codepoint in PORT's encoding, and + set *LEN to the length in bytes of that representation. Return 0 on + success and an errno value on error. */ +static int +get_codepoint (SCM port, scm_t_wchar *codepoint, + char buf[SCM_MBCHAR_BUF_SIZE], size_t *len) +{ + int err; + scm_t_port *pt = SCM_PTAB_ENTRY (port); + scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port); + + if (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8) + err = get_utf8_codepoint (port, codepoint, (scm_t_uint8 *) buf, len); + else + err = get_iconv_codepoint (port, codepoint, buf, len); + + if (SCM_LIKELY (err == 0)) + { + if (SCM_UNLIKELY (pti->at_stream_start_for_bom_read)) + { + /* Record that we're no longer at stream start. */ + pti->at_stream_start_for_bom_read = 0; + if (pt->rw_random) + pti->at_stream_start_for_bom_write = 0; + + /* If we just read a BOM in an encoding that recognizes them, + then silently consume it and read another code point. */ + if (SCM_UNLIKELY + (*codepoint == SCM_UNICODE_BOM + && (pti->encoding_mode == SCM_PORT_ENCODING_MODE_UTF8 + || c_strcasecmp (pt->encoding, "UTF-16") == 0 + || c_strcasecmp (pt->encoding, "UTF-32") == 0))) + return get_codepoint (port, codepoint, buf, len); + } + update_port_lf (*codepoint, port); + } + else if (pt->ilseq_handler == SCM_ICONVEH_QUESTION_MARK) + { + *codepoint = '?'; + err = 0; + update_port_lf (*codepoint, port); + } + + return err; +} + +/* Read a codepoint from PORT and return it. */ +scm_t_wchar +scm_getc (SCM port) +#define FUNC_NAME "scm_getc" +{ + int err; + size_t len; + scm_t_wchar codepoint; + char buf[SCM_MBCHAR_BUF_SIZE]; + + err = get_codepoint (port, &codepoint, buf, &len); + if (SCM_UNLIKELY (err != 0)) + /* At this point PORT should point past the invalid encoding, as per + R6RS-lib Section 8.2.4. */ + scm_decoding_error (FUNC_NAME, err, "input decoding error", port); + + return codepoint; +} +#undef FUNC_NAME + +/* this should only be called when the read buffer is empty. it + tries to refill the read buffer. it returns the first char from + the port, which is either EOF or *(pt->read_pos). */ +static int +scm_i_fill_input (SCM port) +{ + scm_t_port *pt = SCM_PTAB_ENTRY (port); + scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port); + + assert (pt->read_pos == pt->read_end); + + if (pti->pending_eof) + { + pti->pending_eof = 0; + return EOF; + } + + if (pt->read_buf == pt->putback_buf) + { + /* finished reading put-back chars. */ + pt->read_buf = pt->saved_read_buf; + pt->read_pos = pt->saved_read_pos; + pt->read_end = pt->saved_read_end; + pt->read_buf_size = pt->saved_read_buf_size; + if (pt->read_pos < pt->read_end) + return *(pt->read_pos); + } + return scm_ptobs[SCM_PTOBNUM (port)].fill_input (port); +} + +int +scm_fill_input (SCM port) +{ + return scm_i_fill_input (port); +} + +/* Slow-path fallback for 'scm_get_byte_or_eof' in inline.h */ +int +scm_slow_get_byte_or_eof (SCM port) +{ + scm_t_port *pt = SCM_PTAB_ENTRY (port); + + if (pt->rw_active == SCM_PORT_WRITE) + scm_flush (port); + + if (pt->rw_random) + pt->rw_active = SCM_PORT_READ; + + if (pt->read_pos >= pt->read_end) + { + if (SCM_UNLIKELY (scm_i_fill_input (port) == EOF)) + return EOF; + } + + return *pt->read_pos++; +} + +/* Slow-path fallback for 'scm_peek_byte_or_eof' in inline.h */ +int +scm_slow_peek_byte_or_eof (SCM port) +{ + scm_t_port *pt = SCM_PTAB_ENTRY (port); + + if (pt->rw_active == SCM_PORT_WRITE) + scm_flush (port); + + if (pt->rw_random) + pt->rw_active = SCM_PORT_READ; + + if (pt->read_pos >= pt->read_end) + { + if (SCM_UNLIKELY (scm_i_fill_input (port) == EOF)) + { + scm_i_set_pending_eof (port); + return EOF; + } + } + + return *pt->read_pos; +} + + +/* scm_lfwrite + * + * This function differs from scm_c_write; it updates port line and + * column. */ + +void +scm_lfwrite (const char *ptr, size_t size, SCM port) +{ + scm_t_port *pt = SCM_PTAB_ENTRY (port); + scm_t_ptob_descriptor *ptob = &scm_ptobs[SCM_PTOBNUM (port)]; + + if (pt->rw_active == SCM_PORT_READ) + scm_end_input (port); + + ptob->write (port, ptr, size); + + for (; size; ptr++, size--) + update_port_lf ((scm_t_wchar) (unsigned char) *ptr, port); + + if (pt->rw_random) + pt->rw_active = SCM_PORT_WRITE; +} + +/* Write STR to PORT from START inclusive to END exclusive. */ +void +scm_lfwrite_substr (SCM str, size_t start, size_t end, SCM port) +{ + scm_t_port *pt = SCM_PTAB_ENTRY (port); + + if (pt->rw_active == SCM_PORT_READ) + scm_end_input (port); + + if (end == (size_t) -1) + end = scm_i_string_length (str); + + scm_i_display_substring (str, start, end, port); + + if (pt->rw_random) + pt->rw_active = SCM_PORT_WRITE; +} + +/* scm_c_read + * + * Used by an application to read arbitrary number of bytes from an + * SCM port. Same semantics as libc read, except that scm_c_read only + * returns less than SIZE bytes if at end-of-file. + * + * Warning: Doesn't update port line and column counts! */ + +/* This structure, and the following swap_buffer function, are used + for temporarily swapping a port's own read buffer, and the buffer + that the caller of scm_c_read provides. */ +struct port_and_swap_buffer +{ + scm_t_port *pt; + unsigned char *buffer; + size_t size; +}; + +static void +swap_buffer (void *data) +{ + struct port_and_swap_buffer *psb = (struct port_and_swap_buffer *) data; + unsigned char *old_buf = psb->pt->read_buf; + size_t old_size = psb->pt->read_buf_size; + + /* Make the port use (buffer, size) from the struct. */ + psb->pt->read_pos = psb->pt->read_buf = psb->pt->read_end = psb->buffer; + psb->pt->read_buf_size = psb->size; + + /* Save the port's old (buffer, size) in the struct. */ + psb->buffer = old_buf; + psb->size = old_size; +} + +size_t +scm_c_read (SCM port, void *buffer, size_t size) +#define FUNC_NAME "scm_c_read" +{ + scm_t_port *pt; + size_t n_read = 0, n_available; + struct port_and_swap_buffer psb; + + SCM_VALIDATE_OPINPORT (1, port); + + pt = SCM_PTAB_ENTRY (port); + if (pt->rw_active == SCM_PORT_WRITE) + scm_ptobs[SCM_PTOBNUM (port)].flush (port); + + if (pt->rw_random) pt->rw_active = SCM_PORT_READ; /* Take bytes first from the port's read buffer. */ @@ -1111,23 +1672,32 @@ scm_c_read (SCM port, void *buffer, size_t size) if (size == 0) return n_read; - /* Now we will call scm_fill_input repeatedly until we have read the - requested number of bytes. (Note that a single scm_fill_input + /* Now we will call scm_i_fill_input repeatedly until we have read the + requested number of bytes. (Note that a single scm_i_fill_input call does not guarantee to fill the whole of the port's read buffer.) */ - if (pt->read_buf_size <= 1) + if (pt->read_buf_size <= 1 && + (pt->encoding == NULL + || c_strcasecmp (pt->encoding, "ISO-8859-1") == 0)) { /* The port that we are reading from is unbuffered - i.e. does not have its own persistent buffer - but we have a buffer, provided by our caller, that is the right size for the data - that is wanted. For the following scm_fill_input calls, + that is wanted. For the following scm_i_fill_input calls, therefore, we use the buffer in hand as the port's read buffer. We need to make sure that the port's normal (1 byte) buffer - is reinstated in case one of the scm_fill_input () calls + is reinstated in case one of the scm_i_fill_input () calls throws an exception; we use the scm_dynwind_* API to achieve - that. */ + that. + + A consequence of this optimization is that the fill_input + functions can't unget characters. That'll push data to the + pushback buffer instead of this psb buffer. */ +#if SCM_DEBUG == 1 + unsigned char *pback = pt->putback_buf; +#endif psb.pt = pt; psb.buffer = buffer; psb.size = size; @@ -1135,15 +1705,22 @@ scm_c_read (SCM port, void *buffer, size_t size) scm_dynwind_rewind_handler (swap_buffer, &psb, SCM_F_WIND_EXPLICITLY); scm_dynwind_unwind_handler (swap_buffer, &psb, SCM_F_WIND_EXPLICITLY); - /* Call scm_fill_input until we have all the bytes that we need, + /* Call scm_i_fill_input until we have all the bytes that we need, or we hit EOF. */ - while (pt->read_buf_size && (scm_fill_input (port) != EOF)) + while (pt->read_buf_size && (scm_i_fill_input (port) != EOF)) { pt->read_buf_size -= (pt->read_end - pt->read_pos); pt->read_pos = pt->read_buf = pt->read_end; } +#if SCM_DEBUG == 1 + if (pback != pt->putback_buf + || pt->read_buf - (unsigned char *) buffer < 0) + scm_misc_error (FUNC_NAME, + "scm_c_read must not call a fill function that pushes " + "back characters onto an unbuffered port", SCM_EOL); +#endif n_read += pt->read_buf - (unsigned char *) buffer; - + /* Reinstate the port's normal buffer. */ scm_dynwind_end (); } @@ -1154,7 +1731,7 @@ scm_c_read (SCM port, void *buffer, size_t size) that a custom port implementation's entry points (in particular, fill_input) can rely on the buffer always being the same as they first set up. */ - while (size && (scm_fill_input (port) != EOF)) + while (size && (scm_i_fill_input (port) != EOF)) { n_available = min (size, pt->read_end - pt->read_pos); memcpy (buffer, pt->read_pos, n_available); @@ -1201,10 +1778,11 @@ scm_c_write (SCM port, const void *ptr, size_t size) } #undef FUNC_NAME -void +void scm_flush (SCM port) { long i = SCM_PTOBNUM (port); + assert (i >= 0); (scm_ptobs[i].flush) (port); } @@ -1214,6 +1792,7 @@ scm_end_input (SCM port) long offset; scm_t_port *pt = SCM_PTAB_ENTRY (port); + scm_i_clear_pending_eof (port); if (pt->read_buf == pt->putback_buf) { offset = pt->read_end - pt->read_pos; @@ -1231,51 +1810,25 @@ scm_end_input (SCM port) -void -scm_ungetc (int c, SCM port) -#define FUNC_NAME "scm_ungetc" +static void +scm_i_unget_bytes (const unsigned char *buf, size_t len, SCM port) +#define FUNC_NAME "scm_unget_bytes" { scm_t_port *pt = SCM_PTAB_ENTRY (port); + size_t old_len, new_len; - if (pt->read_buf == pt->putback_buf) - /* already using the put-back buffer. */ - { - /* enlarge putback_buf if necessary. */ - if (pt->read_end == pt->read_buf + pt->read_buf_size - && pt->read_buf == pt->read_pos) - { - size_t new_size = pt->read_buf_size * 2; - unsigned char *tmp = (unsigned char *) - scm_gc_realloc (pt->putback_buf, pt->read_buf_size, new_size, - "putback buffer"); - - pt->read_pos = pt->read_buf = pt->putback_buf = tmp; - pt->read_end = pt->read_buf + pt->read_buf_size; - pt->read_buf_size = pt->putback_buf_size = new_size; - } + scm_i_clear_pending_eof (port); - /* shift any existing bytes to buffer + 1. */ - if (pt->read_pos == pt->read_end) - pt->read_end = pt->read_buf + 1; - else if (pt->read_pos != pt->read_buf + 1) - { - int count = pt->read_end - pt->read_pos; - - memmove (pt->read_buf + 1, pt->read_pos, count); - pt->read_end = pt->read_buf + 1 + count; - } - - pt->read_pos = pt->read_buf; - } - else + if (pt->read_buf != pt->putback_buf) /* switch to the put-back buffer. */ { if (pt->putback_buf == NULL) { + pt->putback_buf_size = (len > SCM_INITIAL_PUTBACK_BUF_SIZE + ? len : SCM_INITIAL_PUTBACK_BUF_SIZE); pt->putback_buf - = (unsigned char *) scm_gc_malloc (SCM_INITIAL_PUTBACK_BUF_SIZE, - "putback buffer"); - pt->putback_buf_size = SCM_INITIAL_PUTBACK_BUF_SIZE; + = (unsigned char *) scm_gc_malloc_pointerless + (pt->putback_buf_size, "putback buffer"); } pt->saved_read_buf = pt->read_buf; @@ -1283,15 +1836,110 @@ scm_ungetc (int c, SCM port) pt->saved_read_end = pt->read_end; pt->saved_read_buf_size = pt->read_buf_size; - pt->read_pos = pt->read_buf = pt->putback_buf; - pt->read_end = pt->read_buf + 1; + /* Put read_pos at the end of the buffer, so that ungets will not + have to shift the buffer contents each time. */ + pt->read_buf = pt->putback_buf; + pt->read_pos = pt->read_end = pt->putback_buf + pt->putback_buf_size; pt->read_buf_size = pt->putback_buf_size; } - *pt->read_buf = c; + old_len = pt->read_end - pt->read_pos; + new_len = old_len + len; + + if (new_len > pt->read_buf_size) + /* The putback buffer needs to be enlarged. */ + { + size_t new_buf_size; + unsigned char *new_buf, *new_end, *new_pos; + + new_buf_size = pt->read_buf_size * 2; + if (new_buf_size < new_len) + new_buf_size = new_len; + + new_buf = (unsigned char *) + scm_gc_malloc_pointerless (new_buf_size, "putback buffer"); + + /* Put the bytes at the end of the buffer, so that future + ungets won't need to shift the buffer. */ + new_end = new_buf + new_buf_size; + new_pos = new_end - old_len; + memcpy (new_pos, pt->read_pos, old_len); + + pt->read_buf = pt->putback_buf = new_buf; + pt->read_pos = new_pos; + pt->read_end = new_end; + pt->read_buf_size = pt->putback_buf_size = new_buf_size; + } + else if (pt->read_buf + len < pt->read_pos) + /* If needed, shift the existing buffer contents up. + This should not happen unless some external code + manipulates the putback buffer pointers. */ + { + unsigned char *new_end = pt->read_buf + pt->read_buf_size; + unsigned char *new_pos = new_end - old_len; + + memmove (new_pos, pt->read_pos, old_len); + pt->read_pos = new_pos; + pt->read_end = new_end; + } + + /* Move read_pos back and copy the bytes there. */ + pt->read_pos -= len; + memcpy (pt->read_buf + (pt->read_pos - pt->read_buf), buf, len); + + if (pt->rw_active == SCM_PORT_WRITE) + scm_flush (port); if (pt->rw_random) pt->rw_active = SCM_PORT_READ; +} +#undef FUNC_NAME + +void +scm_unget_bytes (const unsigned char *buf, size_t len, SCM port) +{ + scm_i_unget_bytes (buf, len, port); +} + +void +scm_unget_byte (int c, SCM port) +{ + unsigned char byte; + + byte = c; + scm_i_unget_bytes (&byte, 1, port); +} + +void +scm_ungetc (scm_t_wchar c, SCM port) +#define FUNC_NAME "scm_ungetc" +{ + scm_t_port *pt = SCM_PTAB_ENTRY (port); + char *result; + char result_buf[10]; + const char *encoding; + size_t len; + + if (pt->encoding != NULL) + encoding = pt->encoding; + else + encoding = "ISO-8859-1"; + + len = sizeof (result_buf); + result = u32_conv_to_encoding (encoding, + (enum iconv_ilseq_handler) pt->ilseq_handler, + (uint32_t *) &c, 1, NULL, + result_buf, &len); + + if (SCM_UNLIKELY (result == NULL || len == 0)) + scm_encoding_error (FUNC_NAME, errno, + "conversion to port encoding failed", + SCM_BOOL_F, SCM_MAKE_CHAR (c)); + + scm_i_unget_bytes ((unsigned char *) result, len, port); + + if (SCM_UNLIKELY (result != result_buf)) + free (result); if (c == '\n') { @@ -1335,30 +1983,61 @@ SCM_DEFINE (scm_peek_char, "peek-char", 0, 1, 0, "return the value returned by the preceding call to\n" "@code{peek-char}. In particular, a call to @code{peek-char} on\n" "an interactive port will hang waiting for input whenever a call\n" - "to @code{read-char} would have hung.") + "to @code{read-char} would have hung.\n" + "\n" + "As for @code{read-char}, a @code{decoding-error} may be raised\n" + "if such a situation occurs. However, unlike with @code{read-char},\n" + "@var{port} still points at the beginning of the erroneous byte\n" + "sequence when the error is raised.\n") #define FUNC_NAME s_scm_peek_char { - int c, column; + int err; + SCM result; + scm_t_wchar c; + char bytes[SCM_MBCHAR_BUF_SIZE]; + long column, line; + size_t len; + if (SCM_UNBNDP (port)) port = scm_current_input_port (); + SCM_VALIDATE_OPINPORT (1, port); + + column = SCM_COL (port); + line = SCM_LINUM (port); + + err = get_codepoint (port, &c, bytes, &len); + + scm_i_unget_bytes ((unsigned char *) bytes, len, port); + + SCM_COL (port) = column; + SCM_LINUM (port) = line; + + if (SCM_UNLIKELY (err != 0)) + { + scm_decoding_error (FUNC_NAME, err, "input decoding error", port); + + /* Shouldn't happen since `catch' always aborts to prompt. */ + result = SCM_BOOL_F; + } + else if (c == EOF) + { + scm_i_set_pending_eof (port); + result = SCM_EOF_VAL; + } else - SCM_VALIDATE_OPINPORT (1, port); - column = SCM_COL(port); - c = scm_getc (port); - if (EOF == c) - return SCM_EOF_VAL; - scm_ungetc (c, port); - SCM_COL(port) = column; - return SCM_MAKE_CHAR (c); + result = SCM_MAKE_CHAR (c); + + return result; } #undef FUNC_NAME SCM_DEFINE (scm_unread_char, "unread-char", 1, 1, 0, (SCM cobj, SCM port), - "Place @var{char} in @var{port} so that it will be read by the\n" - "next read operation. If called multiple times, the unread characters\n" - "will be read again in last-in first-out order. If @var{port} is\n" - "not supplied, the current input port is used.") + "Place character @var{cobj} in @var{port} so that it will be\n" + "read by the next read operation. If called multiple times, the\n" + "unread characters will be read again in last-in first-out\n" + "order. If @var{port} is not supplied, the current input port\n" + "is used.") #define FUNC_NAME s_scm_unread_char { int c; @@ -1366,8 +2045,7 @@ SCM_DEFINE (scm_unread_char, "unread-char", 1, 1, 0, SCM_VALIDATE_CHAR (1, cobj); if (SCM_UNBNDP (port)) port = scm_current_input_port (); - else - SCM_VALIDATE_OPINPORT (2, port); + SCM_VALIDATE_OPINPORT (2, port); c = SCM_CHAR (cobj); @@ -1384,13 +2062,16 @@ SCM_DEFINE (scm_unread_string, "unread-string", 2, 0, 0, "@var{port} is not supplied, the current-input-port is used.") #define FUNC_NAME s_scm_unread_string { + int n; SCM_VALIDATE_STRING (1, str); if (SCM_UNBNDP (port)) port = scm_current_input_port (); - else - SCM_VALIDATE_OPINPORT (2, port); + SCM_VALIDATE_OPINPORT (2, port); + + n = scm_i_string_length (str); - scm_ungets (scm_i_string_chars (str), scm_i_string_length (str), port); + while (n--) + scm_ungetc (scm_i_string_ref (str, n), port); return str; } @@ -1398,7 +2079,7 @@ SCM_DEFINE (scm_unread_string, "unread-string", 2, 0, 0, SCM_DEFINE (scm_seek, "seek", 3, 0, 0, (SCM fd_port, SCM offset, SCM whence), - "Sets the current position of @var{fd/port} to the integer\n" + "Sets the current position of @var{fd_port} to the integer\n" "@var{offset}, which is interpreted according to the value of\n" "@var{whence}.\n" "\n" @@ -1413,7 +2094,7 @@ SCM_DEFINE (scm_seek, "seek", 3, 0, 0, "@defvar SEEK_END\n" "Seek from the end of the file.\n" "@end defvar\n" - "If @var{fd/port} is a file descriptor, the underlying system\n" + "If @var{fd_port} is a file descriptor, the underlying system\n" "call is @code{lseek}. @var{port} may be a string port.\n" "\n" "The value returned is the new position in the file. This means\n" @@ -1433,6 +2114,7 @@ SCM_DEFINE (scm_seek, "seek", 3, 0, 0, if (SCM_OPPORTP (fd_port)) { + scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (fd_port); scm_t_ptob_descriptor *ptob = scm_ptobs + SCM_PTOBNUM (fd_port); off_t_or_off64_t off = scm_to_off_t_or_off64_t (offset); off_t_or_off64_t rv; @@ -1441,7 +2123,14 @@ SCM_DEFINE (scm_seek, "seek", 3, 0, 0, SCM_MISC_ERROR ("port is not seekable", scm_cons (fd_port, SCM_EOL)); else - rv = ptob->seek (fd_port, off, how); + rv = ptob->seek (fd_port, off, how); + + /* Set stream-start flags according to new position. */ + pti->at_stream_start_for_bom_read = (rv == 0); + pti->at_stream_start_for_bom_write = (rv == 0); + + scm_i_clear_pending_eof (fd_port); + return scm_from_off_t_or_off64_t (rv); } else /* file descriptor?. */ @@ -1487,8 +2176,9 @@ truncate (const char *file, off_t length) SCM_DEFINE (scm_truncate_file, "truncate-file", 1, 1, 0, (SCM object, SCM length), - "Truncate @var{file} to @var{length} bytes. @var{file} can be a\n" - "filename string, a port object, or an integer file descriptor.\n" + "Truncate file @var{object} to @var{length} bytes. @var{object}\n" + "can be a filename string, a port object, or an integer file\n" + "descriptor.\n" "The return value is unspecified.\n" "\n" "For a port or file descriptor @var{length} can be omitted, in\n" @@ -1529,14 +2219,16 @@ SCM_DEFINE (scm_truncate_file, "truncate-file", 1, 1, 0, off_t_or_off64_t c_length = scm_to_off_t_or_off64_t (length); scm_t_port *pt = SCM_PTAB_ENTRY (object); scm_t_ptob_descriptor *ptob = scm_ptobs + SCM_PTOBNUM (object); - + if (!ptob->truncate) SCM_MISC_ERROR ("port is not truncatable", SCM_EOL); + + scm_i_clear_pending_eof (object); if (pt->rw_active == SCM_PORT_READ) scm_end_input (object); else if (pt->rw_active == SCM_PORT_WRITE) ptob->flush (object); - + ptob->truncate (object, c_length); rv = 0; } @@ -1618,9 +2310,8 @@ SCM_DEFINE (scm_set_port_column_x, "set-port-column!", 2, 0, 0, SCM_DEFINE (scm_port_filename, "port-filename", 1, 0, 0, (SCM port), - "Return the filename associated with @var{port}. This function returns\n" - "the strings \"standard input\", \"standard output\" and \"standard error\"\n" - "when called on the current input, output and error ports respectively.") + "Return the filename associated with @var{port}, or @code{#f}\n" + "if no filename is associated with the port.") #define FUNC_NAME s_scm_port_filename { port = SCM_COERCE_OUTPORT (port); @@ -1645,6 +2336,468 @@ SCM_DEFINE (scm_set_port_filename_x, "set-port-filename!", 2, 0, 0, } #undef FUNC_NAME +/* A fluid specifying the default encoding for newly created ports. If it is + a string, that is the encoding. If it is #f, it is in the "native" + (Latin-1) encoding. */ +SCM_VARIABLE (default_port_encoding_var, "%default-port-encoding"); + +static int scm_port_encoding_init = 0; + +/* Use ENCODING as the default encoding for future ports. */ +void +scm_i_set_default_port_encoding (const char *encoding) +{ + if (!scm_port_encoding_init + || !scm_is_fluid (SCM_VARIABLE_REF (default_port_encoding_var))) + scm_misc_error (NULL, "tried to set port encoding fluid before it is initialized", + SCM_EOL); + + if (encoding == NULL + || c_strcasecmp (encoding, "ASCII") == 0 + || c_strcasecmp (encoding, "ANSI_X3.4-1968") == 0 + || c_strcasecmp (encoding, "ISO-8859-1") == 0) + scm_fluid_set_x (SCM_VARIABLE_REF (default_port_encoding_var), SCM_BOOL_F); + else + scm_fluid_set_x (SCM_VARIABLE_REF (default_port_encoding_var), + scm_from_locale_string (encoding)); +} + +/* Return the name of the default encoding for newly created ports; a + return value of NULL means "ISO-8859-1". */ +const char * +scm_i_default_port_encoding (void) +{ + if (!scm_port_encoding_init) + return NULL; + else if (!scm_is_fluid (SCM_VARIABLE_REF (default_port_encoding_var))) + return NULL; + else + { + SCM encoding; + + encoding = scm_fluid_ref (SCM_VARIABLE_REF (default_port_encoding_var)); + if (!scm_is_string (encoding)) + return NULL; + else + return scm_i_string_chars (encoding); + } +} + +/* If the next LEN bytes from PORT are equal to those in BYTES, then + return 1, else return 0. Leave the port position unchanged. */ +static int +looking_at_bytes (SCM port, const unsigned char *bytes, int len) +{ + scm_t_port *pt = SCM_PTAB_ENTRY (port); + int i = 0; + + while (i < len && scm_peek_byte_or_eof (port) == bytes[i]) + { + pt->read_pos++; + i++; + } + scm_i_unget_bytes (bytes, i, port); + return (i == len); +} + +static const unsigned char scm_utf8_bom[3] = {0xEF, 0xBB, 0xBF}; +static const unsigned char scm_utf16be_bom[2] = {0xFE, 0xFF}; +static const unsigned char scm_utf16le_bom[2] = {0xFF, 0xFE}; +static const unsigned char scm_utf32be_bom[4] = {0x00, 0x00, 0xFE, 0xFF}; +static const unsigned char scm_utf32le_bom[4] = {0xFF, 0xFE, 0x00, 0x00}; + +/* Decide what byte order to use for a UTF-16 port. Return "UTF-16BE" + or "UTF-16LE". MODE must be either SCM_PORT_READ or SCM_PORT_WRITE, + and specifies which operation is about to be done. The MODE + determines how we will decide the byte order. We deliberately avoid + reading from the port unless the user is about to do so. If the user + is about to read, then we look for a BOM, and if present, we use it + to determine the byte order. Otherwise we choose big endian, as + recommended by the Unicode Standard. Note that the BOM (if any) is + not consumed here. */ +static const char * +decide_utf16_encoding (SCM port, scm_t_port_rw_active mode) +{ + if (mode == SCM_PORT_READ + && SCM_PORT_GET_INTERNAL (port)->at_stream_start_for_bom_read + && looking_at_bytes (port, scm_utf16le_bom, sizeof scm_utf16le_bom)) + return "UTF-16LE"; + else + return "UTF-16BE"; +} + +/* Decide what byte order to use for a UTF-32 port. Return "UTF-32BE" + or "UTF-32LE". See the comment above 'decide_utf16_encoding' for + details. */ +static const char * +decide_utf32_encoding (SCM port, scm_t_port_rw_active mode) +{ + if (mode == SCM_PORT_READ + && SCM_PORT_GET_INTERNAL (port)->at_stream_start_for_bom_read + && looking_at_bytes (port, scm_utf32le_bom, sizeof scm_utf32le_bom)) + return "UTF-32LE"; + else + return "UTF-32BE"; +} + +static void +finalize_iconv_descriptors (void *ptr, void *data) +{ + close_iconv_descriptors (ptr); +} + +static scm_t_iconv_descriptors * +open_iconv_descriptors (const char *encoding, int reading, int writing) +{ + scm_t_iconv_descriptors *id; + iconv_t input_cd, output_cd; + + input_cd = (iconv_t) -1; + output_cd = (iconv_t) -1; + if (reading) + { + /* Open an input iconv conversion descriptor, from ENCODING + to UTF-8. We choose UTF-8, not UTF-32, because iconv + implementations can typically convert from anything to + UTF-8, but not to UTF-32 (see + ). */ + + /* Assume opening an iconv descriptor causes about 16 KB of + allocation. */ + scm_gc_register_allocation (16 * 1024); + + input_cd = iconv_open ("UTF-8", encoding); + if (input_cd == (iconv_t) -1) + goto invalid_encoding; + } + + if (writing) + { + /* Assume opening an iconv descriptor causes about 16 KB of + allocation. */ + scm_gc_register_allocation (16 * 1024); + + output_cd = iconv_open (encoding, "UTF-8"); + if (output_cd == (iconv_t) -1) + { + if (input_cd != (iconv_t) -1) + iconv_close (input_cd); + goto invalid_encoding; + } + } + + id = scm_gc_malloc_pointerless (sizeof (*id), "iconv descriptors"); + id->input_cd = input_cd; + id->output_cd = output_cd; + + /* Register a finalizer to close the descriptors. */ + scm_i_set_finalizer (id, finalize_iconv_descriptors, NULL); + + return id; + + invalid_encoding: + { + SCM err; + err = scm_from_locale_string (encoding); + scm_misc_error ("open_iconv_descriptors", + "invalid or unknown character encoding ~s", + scm_list_1 (err)); + } +} + +static void +close_iconv_descriptors (scm_t_iconv_descriptors *id) +{ + if (id->input_cd != (iconv_t) -1) + iconv_close (id->input_cd); + if (id->output_cd != (iconv_t) -1) + iconv_close (id->output_cd); + id->input_cd = (void *) -1; + id->output_cd = (void *) -1; +} + +/* Return the iconv_descriptors, initializing them if necessary. MODE + must be either SCM_PORT_READ or SCM_PORT_WRITE, and specifies which + operation is about to be done. We deliberately avoid reading from + the port unless the user was about to do so. */ +scm_t_iconv_descriptors * +scm_i_port_iconv_descriptors (SCM port, scm_t_port_rw_active mode) +{ + scm_t_port_internal *pti = SCM_PORT_GET_INTERNAL (port); + + assert (pti->encoding_mode == SCM_PORT_ENCODING_MODE_ICONV); + + if (!pti->iconv_descriptors) + { + scm_t_port *pt = SCM_PTAB_ENTRY (port); + const char *precise_encoding; + + if (!pt->encoding) + pt->encoding = "ISO-8859-1"; + + /* If the specified encoding is UTF-16 or UTF-32, then make + that more precise by deciding what byte order to use. */ + if (c_strcasecmp (pt->encoding, "UTF-16") == 0) + precise_encoding = decide_utf16_encoding (port, mode); + else if (c_strcasecmp (pt->encoding, "UTF-32") == 0) + precise_encoding = decide_utf32_encoding (port, mode); + else + precise_encoding = pt->encoding; + + pti->iconv_descriptors = + open_iconv_descriptors (precise_encoding, + SCM_INPUT_PORT_P (port), + SCM_OUTPUT_PORT_P (port)); + } + + return pti->iconv_descriptors; +} + +void +scm_i_set_port_encoding_x (SCM port, const char *encoding) +{ + scm_t_port *pt; + scm_t_port_internal *pti; + scm_t_iconv_descriptors *prev; + + /* Set the character encoding for this port. */ + pt = SCM_PTAB_ENTRY (port); + pti = SCM_PORT_GET_INTERNAL (port); + prev = pti->iconv_descriptors; + + /* In order to handle cases where the encoding changes mid-stream + (e.g. within an HTTP stream, or within a file that is composed of + segments with different encodings), we consider this to be "stream + start" for purposes of BOM handling, regardless of our actual file + position. */ + pti->at_stream_start_for_bom_read = 1; + pti->at_stream_start_for_bom_write = 1; + + if (encoding == NULL) + encoding = "ISO-8859-1"; + + /* If ENCODING is UTF-8, then no conversion descriptor is opened + because we do I/O ourselves. This saves 100+ KiB for each + descriptor. */ + pt->encoding = scm_gc_strdup (encoding, "port"); + if (c_strcasecmp (encoding, "UTF-8") == 0) + pti->encoding_mode = SCM_PORT_ENCODING_MODE_UTF8; + else + pti->encoding_mode = SCM_PORT_ENCODING_MODE_ICONV; + + pti->iconv_descriptors = NULL; + if (prev) + close_iconv_descriptors (prev); +} + +SCM_DEFINE (scm_port_encoding, "port-encoding", 1, 0, 0, + (SCM port), + "Returns, as a string, the character encoding that @var{port}\n" + "uses to interpret its input and output.\n") +#define FUNC_NAME s_scm_port_encoding +{ + scm_t_port *pt; + const char *enc; + + SCM_VALIDATE_PORT (1, port); + + pt = SCM_PTAB_ENTRY (port); + enc = pt->encoding; + if (enc) + return scm_from_locale_string (pt->encoding); + else + return SCM_BOOL_F; +} +#undef FUNC_NAME + +SCM_DEFINE (scm_set_port_encoding_x, "set-port-encoding!", 2, 0, 0, + (SCM port, SCM enc), + "Sets the character encoding that will be used to interpret all\n" + "port I/O. New ports are created with the encoding\n" + "appropriate for the current locale if @code{setlocale} has \n" + "been called or ISO-8859-1 otherwise\n" + "and this procedure can be used to modify that encoding.\n") +#define FUNC_NAME s_scm_set_port_encoding_x +{ + char *enc_str; + + SCM_VALIDATE_PORT (1, port); + SCM_VALIDATE_STRING (2, enc); + + enc_str = scm_to_locale_string (enc); + scm_i_set_port_encoding_x (port, enc_str); + free (enc_str); + + return SCM_UNSPECIFIED; +} +#undef FUNC_NAME + + +/* A fluid specifying the default conversion handler for newly created + ports. Its value should be one of the symbols below. */ +SCM_VARIABLE (default_conversion_strategy_var, + "%default-port-conversion-strategy"); + +/* Whether the above fluid is initialized. */ +static int scm_conversion_strategy_init = 0; + +/* The possible conversion strategies. */ +SCM_SYMBOL (sym_error, "error"); +SCM_SYMBOL (sym_substitute, "substitute"); +SCM_SYMBOL (sym_escape, "escape"); + +/* Return the default failed encoding conversion policy for new created + ports. */ +scm_t_string_failed_conversion_handler +scm_i_default_port_conversion_handler (void) +{ + scm_t_string_failed_conversion_handler handler; + + if (!scm_conversion_strategy_init + || !scm_is_fluid (SCM_VARIABLE_REF (default_conversion_strategy_var))) + handler = SCM_FAILED_CONVERSION_QUESTION_MARK; + else + { + SCM fluid, value; + + fluid = SCM_VARIABLE_REF (default_conversion_strategy_var); + value = scm_fluid_ref (fluid); + + if (scm_is_eq (sym_substitute, value)) + handler = SCM_FAILED_CONVERSION_QUESTION_MARK; + else if (scm_is_eq (sym_escape, value)) + handler = SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE; + else + /* Default to 'error also when the fluid's value is not one of + the valid symbols. */ + handler = SCM_FAILED_CONVERSION_ERROR; + } + + return handler; +} + +/* Use HANDLER as the default conversion strategy for future ports. */ +void +scm_i_set_default_port_conversion_handler (scm_t_string_failed_conversion_handler + handler) +{ + SCM strategy; + + if (!scm_conversion_strategy_init + || !scm_is_fluid (SCM_VARIABLE_REF (default_conversion_strategy_var))) + scm_misc_error (NULL, "tried to set conversion strategy fluid before it is initialized", + SCM_EOL); + + switch (handler) + { + case SCM_FAILED_CONVERSION_ERROR: + strategy = sym_error; + break; + + case SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE: + strategy = sym_escape; + break; + + case SCM_FAILED_CONVERSION_QUESTION_MARK: + strategy = sym_substitute; + break; + + default: + abort (); + } + + scm_fluid_set_x (SCM_VARIABLE_REF (default_conversion_strategy_var), + strategy); +} + +SCM_DEFINE (scm_port_conversion_strategy, "port-conversion-strategy", + 1, 0, 0, (SCM port), + "Returns the behavior of the port when handling a character that\n" + "is not representable in the port's current encoding.\n" + "It returns the symbol @code{error} if unrepresentable characters\n" + "should cause exceptions, @code{substitute} if the port should\n" + "try to replace unrepresentable characters with question marks or\n" + "approximate characters, or @code{escape} if unrepresentable\n" + "characters should be converted to string escapes.\n" + "\n" + "If @var{port} is @code{#f}, then the current default behavior\n" + "will be returned. New ports will have this default behavior\n" + "when they are created.\n") +#define FUNC_NAME s_scm_port_conversion_strategy +{ + scm_t_string_failed_conversion_handler h; + + if (scm_is_false (port)) + h = scm_i_default_port_conversion_handler (); + else + { + scm_t_port *pt; + + SCM_VALIDATE_OPPORT (1, port); + pt = SCM_PTAB_ENTRY (port); + + h = pt->ilseq_handler; + } + + if (h == SCM_FAILED_CONVERSION_ERROR) + return scm_from_latin1_symbol ("error"); + else if (h == SCM_FAILED_CONVERSION_QUESTION_MARK) + return scm_from_latin1_symbol ("substitute"); + else if (h == SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE) + return scm_from_latin1_symbol ("escape"); + else + abort (); + + /* Never gets here. */ + return SCM_UNDEFINED; +} +#undef FUNC_NAME + +SCM_DEFINE (scm_set_port_conversion_strategy_x, "set-port-conversion-strategy!", + 2, 0, 0, + (SCM port, SCM sym), + "Sets the behavior of the interpreter when outputting a character\n" + "that is not representable in the port's current encoding.\n" + "@var{sym} can be either @code{'error}, @code{'substitute}, or\n" + "@code{'escape}. If it is @code{'error}, an error will be thrown\n" + "when an unconvertible character is encountered. If it is\n" + "@code{'substitute}, then unconvertible characters will \n" + "be replaced with approximate characters, or with question marks\n" + "if no approximately correct character is available.\n" + "If it is @code{'escape},\n" + "it will appear as a hex escape when output.\n" + "\n" + "If @var{port} is an open port, the conversion error behavior\n" + "is set for that port. If it is @code{#f}, it is set as the\n" + "default behavior for any future ports that get created in\n" + "this thread.\n") +#define FUNC_NAME s_scm_set_port_conversion_strategy_x +{ + scm_t_string_failed_conversion_handler handler; + + if (scm_is_eq (sym, sym_error)) + handler = SCM_FAILED_CONVERSION_ERROR; + else if (scm_is_eq (sym, sym_substitute)) + handler = SCM_FAILED_CONVERSION_QUESTION_MARK; + else if (scm_is_eq (sym, sym_escape)) + handler = SCM_FAILED_CONVERSION_ESCAPE_SEQUENCE; + else + SCM_MISC_ERROR ("unknown conversion strategy ~s", scm_list_1 (sym)); + + if (scm_is_false (port)) + scm_i_set_default_port_conversion_handler (handler); + else + { + SCM_VALIDATE_OPPORT (1, port); + SCM_PTAB_ENTRY (port)->ilseq_handler = handler; + } + + return SCM_UNSPECIFIED; +} +#undef FUNC_NAME + + + void scm_print_port_mode (SCM exp, SCM port) { @@ -1675,13 +2828,6 @@ scm_port_print (SCM exp, SCM port, scm_print_state *pstate SCM_UNUSED) return 1; } -void -scm_ports_prehistory () -{ - scm_numptob = 0; - scm_ptobs = (scm_t_ptob_descriptor *) scm_malloc (sizeof (scm_t_ptob_descriptor)); -} - /* Void ports. */ @@ -1749,14 +2895,29 @@ scm_init_ports () scm_tc16_void_port = scm_make_port_type ("void", fill_input_void_port, write_void_port); - cur_inport_fluid = scm_permanent_object (scm_make_fluid ()); - cur_outport_fluid = scm_permanent_object (scm_make_fluid ()); - cur_errport_fluid = scm_permanent_object (scm_make_fluid ()); - cur_loadport_fluid = scm_permanent_object (scm_make_fluid ()); + cur_inport_fluid = scm_make_fluid (); + cur_outport_fluid = scm_make_fluid (); + cur_errport_fluid = scm_make_fluid (); + cur_loadport_fluid = scm_make_fluid (); + + scm_i_port_weak_hash = scm_make_weak_key_hash_table (SCM_I_MAKINUM(31)); - scm_i_port_weak_hash = scm_permanent_object (scm_make_weak_key_hash_table (SCM_I_MAKINUM(31))); - #include "libguile/ports.x" + + /* Use Latin-1 as the default port encoding. */ + SCM_VARIABLE_SET (default_port_encoding_var, + scm_make_fluid_with_default (SCM_BOOL_F)); + scm_port_encoding_init = 1; + + SCM_VARIABLE_SET (default_conversion_strategy_var, + scm_make_fluid_with_default (sym_substitute)); + scm_conversion_strategy_init = 1; + + /* These bindings are used when boot-9 turns `current-input-port' et + al into parameters. They are then removed from the guile module. */ + scm_c_define ("%current-input-port-fluid", cur_inport_fluid); + scm_c_define ("%current-output-port-fluid", cur_outport_fluid); + scm_c_define ("%current-error-port-fluid", cur_errport_fluid); } /*