Have `read-char' & co. throw to `decoding-error'.

[bpt/guile.git] / libguile / ports.c
diff --git a/libguile/ports.c b/libguile/ports.c

index 60f318c..46404be 100644 (file)
--- a/libguile/ports.c
+++ b/libguile/ports.c
@@ -1029,7 +1029,11 @@ SCM_DEFINE (scm_read_char, "read-char", 0, 1, 0,
             (SCM port),
             "Return the next character available from @var{port}, updating\n"
             "@var{port} to point to the following character.  If no more\n"
-           "characters are available, the end-of-file object is returned.")
+           "characters are available, the end-of-file object is returned.\n"
+           "\n"
+           "When @var{port}'s data cannot be decoded according to its\n"
+           "character encoding, a @code{decoding-error} is raised and\n"
+           "@var{port} points past the erroneous byte sequence.\n")
  #define FUNC_NAME s_scm_read_char
  {
    scm_t_wchar c;
@@ -1108,16 +1112,16 @@ utf8_to_codepoint (const scm_t_uint8 *utf8_buf, size_t size)
    return codepoint;
  }
  
-/* Read a codepoint from PORT and return it.  Fill BUF with the byte
-   representation of the codepoint in PORT's encoding, and set *LEN to
-   the length in bytes of that representation.  Raise an error on
-   failure.  */
-static scm_t_wchar
-get_codepoint (SCM port, char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
+/* Read a codepoint from PORT and return it in *CODEPOINT.  Fill BUF
+   with the byte representation of the codepoint in PORT's encoding, and
+   set *LEN to the length in bytes of that representation.  Return 0 on
+   success and an errno value on error.  */
+static int
+get_codepoint (SCM port, scm_t_wchar *codepoint,
+              char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
  {
    int err, byte_read;
    size_t bytes_consumed, output_size;
-  scm_t_wchar codepoint;
    char *output;
    scm_t_uint8 utf8_buf[SCM_MBCHAR_BUF_SIZE];
    scm_t_port *pt = SCM_PTAB_ENTRY (port);
@@ -1139,7 +1143,11 @@ get_codepoint (SCM port, char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
        if (byte_read == EOF)
         {
           if (bytes_consumed == 0)
-           return (scm_t_wchar) EOF;
+           {
+             *codepoint = (scm_t_wchar) EOF;
+             *len = 0;
+             return 0;
+           }
           else
             continue;
         }
@@ -1163,46 +1171,52 @@ get_codepoint (SCM port, char buf[SCM_MBCHAR_BUF_SIZE], size_t *len)
         output_size = sizeof (utf8_buf) - output_left;
      }
  
-  if (err != 0)
-    goto failure;
+  if (SCM_UNLIKELY (err != 0))
+    {
+      /* Reset the `iconv' state.  */
+      iconv (pt->input_cd, NULL, NULL, NULL, NULL);
  
-  /* Convert the UTF8_BUF sequence to a Unicode code point.  */
-  codepoint = utf8_to_codepoint (utf8_buf, output_size);
-  update_port_lf (codepoint, port);
+      if (pt->ilseq_handler == SCM_ICONVEH_QUESTION_MARK)
+       {
+         *codepoint = '?';
+         err = 0;
+       }
  
-  *len = bytes_consumed;
+      /* Fail when the strategy is SCM_ICONVEH_ERROR or
+        SCM_ICONVEH_ESCAPE_SEQUENCE (the latter doesn't make sense for
+        input encoding errors.)  */
+    }
+  else
+    /* Convert the UTF8_BUF sequence to a Unicode code point.  */
+    *codepoint = utf8_to_codepoint (utf8_buf, output_size);
  
-  return codepoint;
+  if (SCM_LIKELY (err == 0))
+    update_port_lf (*codepoint, port);
  
- failure:
-  {
-    char *err_buf;
-    SCM err_str = scm_i_make_string (bytes_consumed, &err_buf);
-    memcpy (err_buf, buf, bytes_consumed);
-
-    if (err == EILSEQ)
-      scm_misc_error (NULL, "input encoding error for ~s: ~s",
-                     scm_list_2 (scm_from_locale_string (scm_i_get_port_encoding (port)),
-                                 err_str));
-    else
-      scm_misc_error (NULL, "input encoding error (invalid) for ~s: ~s\n", 
-                     scm_list_2 (scm_from_locale_string (scm_i_get_port_encoding (port)),
-                                 err_str));
-  }
+  *len = bytes_consumed;
  
-  /* Never gets here.  */
-  return 0;
+  return err;
  }
  
  /* Read a codepoint from PORT and return it.  */
  scm_t_wchar
  scm_getc (SCM port)
+#define FUNC_NAME "scm_getc"
  {
+  int err;
    size_t len;
+  scm_t_wchar codepoint;
    char buf[SCM_MBCHAR_BUF_SIZE];
  
-  return get_codepoint (port, buf, &len);
+  err = get_codepoint (port, &codepoint, buf, &len);
+  if (SCM_UNLIKELY (err != 0))
+    /* At this point PORT should point past the invalid encoding, as per
+       R6RS-lib Section 8.2.4.  */
+    scm_decoding_error (FUNC_NAME, err, "input decoding error", port);
+
+  return codepoint;
  }
+#undef FUNC_NAME
  
  /* this should only be called when the read buffer is empty.  it
     tries to refill the read buffer.  it returns the first char from
@@ -1251,38 +1265,19 @@ scm_lfwrite (const char *ptr, size_t size, SCM port)
      pt->rw_active = SCM_PORT_WRITE;
  }
  
-/* Write a scheme string STR to PORT from START inclusive to END
-   exclusive.  */
+/* Write STR to PORT from START inclusive to END exclusive.  */
  void
  scm_lfwrite_substr (SCM str, size_t start, size_t end, SCM port)
  {
-  size_t i, size = scm_i_string_length (str);
    scm_t_port *pt = SCM_PTAB_ENTRY (port);
-  scm_t_ptob_descriptor *ptob = &scm_ptobs[SCM_PTOBNUM (port)];
-  scm_t_wchar p;
-  char *buf;
-  size_t len;
  
    if (pt->rw_active == SCM_PORT_READ)
      scm_end_input (port);
  
-  if (end == (size_t) (-1))
-    end = size;
-  size = end - start;
+  if (end == (size_t) -1)
+    end = scm_i_string_length (str);
  
-  /* Note that making a substring will likely take the
-     stringbuf_write_mutex.  So, one shouldn't use scm_lfwrite_substr
-     if the stringbuf write mutex may still be held elsewhere.  */
-  buf = scm_to_stringn (scm_c_substring (str, start, end), &len,
-                       pt->encoding, pt->ilseq_handler);
-  ptob->write (port, buf, len);
-  free (buf);
-
-  for (i = 0; i < size; i++)
-    {
-      p = scm_i_string_ref (str, i + start);
-      update_port_lf (p, port);
-    }
+  scm_display (scm_c_substring (str, start, end), port);
  
    if (pt->rw_random)
      pt->rw_active = SCM_PORT_WRITE;
@@ -1634,13 +1629,19 @@ SCM_DEFINE (scm_peek_char, "peek-char", 0, 1, 0,
             "return the value returned by the preceding call to\n"
             "@code{peek-char}.  In particular, a call to @code{peek-char} on\n"
             "an interactive port will hang waiting for input whenever a call\n"
-           "to @code{read-char} would have hung.")
+           "to @code{read-char} would have hung.\n"
+           "\n"
+           "As for @code{read-char}, a @code{decoding-error} may be raised\n"
+           "if such a situation occurs.  However, unlike with @code{read-char},\n"
+           "@var{port} still points at the beginning of the erroneous byte\n"
+           "sequence when the error is raised.\n")
  #define FUNC_NAME s_scm_peek_char
  {
+  int err;
    SCM result;
    scm_t_wchar c;
    char bytes[SCM_MBCHAR_BUF_SIZE];
-  long column, line;
+  long column, line, i;
    size_t len;
  
    if (SCM_UNBNDP (port))
@@ -1650,21 +1651,25 @@ SCM_DEFINE (scm_peek_char, "peek-char", 0, 1, 0,
    column = SCM_COL (port);
    line = SCM_LINUM (port);
  
-  c = get_codepoint (port, bytes, &len);
-  if (c == EOF)
-    result = SCM_EOF_VAL;
-  else
-    {
-      long i;
+  err = get_codepoint (port, &c, bytes, &len);
+
+  for (i = len - 1; i >= 0; i--)
+    scm_unget_byte (bytes[i], port);
  
-      result = SCM_MAKE_CHAR (c);
+  SCM_COL (port) = column;
+  SCM_LINUM (port) = line;
  
-      for (i = len - 1; i >= 0; i--)
-       scm_unget_byte (bytes[i], port);
+  if (SCM_UNLIKELY (err != 0))
+    {
+      scm_decoding_error (FUNC_NAME, err, "input decoding error", port);
  
-      SCM_COL (port) = column;
-      SCM_LINUM (port) = line;
+      /* Shouldn't happen since `catch' always aborts to prompt.  */
+      result = SCM_BOOL_F;
      }
+  else if (c == EOF)
+    result = SCM_EOF_VAL;
+  else
+    result = SCM_MAKE_CHAR (c);
  
    return result;
  }
@@ -1996,10 +2001,7 @@ scm_i_get_port_encoding (SCM port)
      {
        scm_t_port *pt;
        pt = SCM_PTAB_ENTRY (port);
-      if (pt->encoding)
-       return pt->encoding;
-      else
-       return NULL;
+      return pt->encoding;
      }
  }
  
@@ -2071,14 +2073,12 @@ scm_i_set_port_encoding_x (SCM port, const char *enc)
  
        /* Set the character encoding for this port.  */
        pt = SCM_PTAB_ENTRY (port);
-      if (valid_enc == NULL)
-        pt->encoding = NULL;
-      else
-        pt->encoding = scm_gc_strdup (valid_enc, "port");
  
        if (valid_enc == NULL)
         valid_enc = "ISO-8859-1";
  
+      pt->encoding = scm_gc_strdup (valid_enc, "port");
+
        if (SCM_CELL_WORD_0 (port) & SCM_RDNG)
         {
           /* Open an input iconv conversion descriptor, from VALID_ENC
@@ -2152,24 +2152,13 @@ SCM_DEFINE (scm_set_port_encoding_x, "set-port-encoding!", 2, 0, 0,
  #define FUNC_NAME s_scm_set_port_encoding_x
  {
    char *enc_str;
-  const char *valid_enc_str;
  
    SCM_VALIDATE_PORT (1, port);
    SCM_VALIDATE_STRING (2, enc);
  
    enc_str = scm_to_locale_string (enc);
-  valid_enc_str = find_valid_encoding (enc_str);
-  if (valid_enc_str == NULL)
-    {
-      free (enc_str);
-      scm_misc_error (FUNC_NAME, "invalid or unknown character encoding ~s",
-                     scm_list_1 (enc));
-    }
-  else
-    {
-      scm_i_set_port_encoding_x (port, valid_enc_str);
-      free (enc_str);
-    }
+  scm_i_set_port_encoding_x (port, enc_str);
+
    return SCM_UNSPECIFIED;
  }
  #undef FUNC_NAME