Fix `get_utf8_codepoint' to not consume valid starting bytes.

author Ludovic Courtès <ludo@gnu.org>

Sat, 7 May 2011 20:46:38 +0000 (22:46 +0200)

committer Ludovic Courtès <ludo@gnu.org>

Sat, 7 May 2011 20:47:49 +0000 (22:47 +0200)
author Ludovic Courtès <ludo@gnu.org>
Sat, 7 May 2011 20:46:38 +0000 (22:46 +0200)
committer Ludovic Courtès <ludo@gnu.org>
Sat, 7 May 2011 20:47:49 +0000 (22:47 +0200)
diff --git a/libguile/ports.c b/libguile/ports.c

index 767e086..926149b 100644 (file)
--- a/libguile/ports.c
+++ b/libguile/ports.c
@@ -1127,10 +1127,14 @@ get_utf8_codepoint (SCM port, scm_t_wchar *codepoint,
  #define ASSERT_NOT_EOF(b)                      \
    if (SCM_UNLIKELY ((b) == EOF))               \
      goto invalid_seq
+#define CONSUME_PEEKED_BYTE()                          \
+  pt->read_pos++
  
    int byte;
+  scm_t_port *pt;
  
    *len = 0;
+  pt = SCM_PTAB_ENTRY (port);
  
    byte = scm_get_byte_or_eof (port);
    if (byte == EOF)
@@ -1148,49 +1152,44 @@ get_utf8_codepoint (SCM port, scm_t_wchar *codepoint,
    else if (buf[0] >= 0xc2 && buf[0] <= 0xdf)
      {
        /* 2-byte form.  */
-      byte = scm_get_byte_or_eof (port);
+      byte = scm_peek_byte_or_eof (port);
        ASSERT_NOT_EOF (byte);
  
-      buf[1] = (scm_t_uint8) byte;
-      *len = 2;
-
        if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
         goto invalid_seq;
  
+      CONSUME_PEEKED_BYTE ();
+      buf[1] = (scm_t_uint8) byte;
+      *len = 2;
+
        *codepoint = ((scm_t_wchar) buf[0] & 0x1f) << 6UL
         | (buf[1] & 0x3f);
      }
    else if ((buf[0] & 0xf0) == 0xe0)
      {
        /* 3-byte form.  */
-      byte = scm_get_byte_or_eof (port);
-      if (SCM_UNLIKELY (byte == EOF))
-       goto invalid_seq;
-
-      buf[1] = (scm_t_uint8) byte;
-      *len = 2;
+      byte = scm_peek_byte_or_eof (port);
+      ASSERT_NOT_EOF (byte);
  
        if (SCM_UNLIKELY ((byte & 0xc0) != 0x80
                         || (buf[0] == 0xe0 && byte < 0xa0)
                         || (buf[0] == 0xed && byte > 0x9f)))
-       {
-         /* Swallow the 3rd byte.  */
-         byte = scm_get_byte_or_eof (port);
-         ASSERT_NOT_EOF (byte);
-         *len = 3, buf[2] = byte;
-         goto invalid_seq;
-       }
+       goto invalid_seq;
  
+      CONSUME_PEEKED_BYTE ();
+      buf[1] = (scm_t_uint8) byte;
+      *len = 2;
  
-      byte = scm_get_byte_or_eof (port);
+      byte = scm_peek_byte_or_eof (port);
        ASSERT_NOT_EOF (byte);
  
-      buf[2] = (scm_t_uint8) byte;
-      *len = 3;
-
        if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
         goto invalid_seq;
  
+      CONSUME_PEEKED_BYTE ();
+      buf[2] = (scm_t_uint8) byte;
+      *len = 3;
+
        *codepoint = ((scm_t_wchar) buf[0] & 0x0f) << 12UL
         | ((scm_t_wchar) buf[1] & 0x3f) << 6UL
         | (buf[2] & 0x3f);
@@ -1198,51 +1197,38 @@ get_utf8_codepoint (SCM port, scm_t_wchar *codepoint,
    else if (buf[0] >= 0xf0 && buf[0] <= 0xf4)
      {
        /* 4-byte form.  */
-      byte = scm_get_byte_or_eof (port);
+      byte = scm_peek_byte_or_eof (port);
        ASSERT_NOT_EOF (byte);
  
-      buf[1] = (scm_t_uint8) byte;
-      *len = 2;
-
        if (SCM_UNLIKELY (((byte & 0xc0) != 0x80)
                         || (buf[0] == 0xf0 && byte < 0x90)
                         || (buf[0] == 0xf4 && byte > 0x8f)))
-       {
-         /* Swallow the 3rd and 4th bytes.  */
-         byte = scm_get_byte_or_eof (port);
-         ASSERT_NOT_EOF (byte);
-         *len = 3, buf[2] = byte;
-
-         byte = scm_get_byte_or_eof (port);
-         ASSERT_NOT_EOF (byte);
-         *len = 4, buf[3] = byte;
-         goto invalid_seq;
-       }
+       goto invalid_seq;
  
-      byte = scm_get_byte_or_eof (port);
+      CONSUME_PEEKED_BYTE ();
+      buf[1] = (scm_t_uint8) byte;
+      *len = 2;
+
+      byte = scm_peek_byte_or_eof (port);
        ASSERT_NOT_EOF (byte);
  
+      if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
+       goto invalid_seq;
+
+      CONSUME_PEEKED_BYTE ();
        buf[2] = (scm_t_uint8) byte;
        *len = 3;
  
-      if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
-       {
-         /* Swallow the 4th byte.  */
-         byte = scm_get_byte_or_eof (port);
-         ASSERT_NOT_EOF (byte);
-         *len = 4, buf[3] = byte;
-         goto invalid_seq;
-       }
-
-      byte = scm_get_byte_or_eof (port);
+      byte = scm_peek_byte_or_eof (port);
        ASSERT_NOT_EOF (byte);
  
-      buf[3] = (scm_t_uint8) byte;
-      *len = 4;
-
        if (SCM_UNLIKELY ((byte & 0xc0) != 0x80))
         goto invalid_seq;
  
+      CONSUME_PEEKED_BYTE ();
+      buf[3] = (scm_t_uint8) byte;
+      *len = 4;
+
        *codepoint = ((scm_t_wchar) buf[0] & 0x07) << 18UL
         | ((scm_t_wchar) buf[1] & 0x3f) << 12UL
         | ((scm_t_wchar) buf[2] & 0x3f) << 6UL
@@ -1254,8 +1240,14 @@ get_utf8_codepoint (SCM port, scm_t_wchar *codepoint,
    return 0;
  
   invalid_seq:
+  /* Here we could choose to consume the faulty byte when it's not a
+     valid starting byte, but it's not a requirement.  What Section 3.9
+     of Unicode 6.0.0 mandates, though, is to not consume a byte that
+     would otherwise be a valid starting byte.  */
+
    return EILSEQ;
  
+#undef CONSUME_PEEKED_BYTE
  #undef ASSERT_NOT_EOF
  }
  
diff --git a/test-suite/tests/ports.test b/test-suite/tests/ports.test

index c1ee7d1..d4a333f 100644 (file)
--- a/test-suite/tests/ports.test
+++ b/test-suite/tests/ports.test
@@ -572,29 +572,40 @@
         eof))
  
      (test-decoding-error (#xc2 #x41 #x42) "UTF-8"
-      ;; FIXME: This is the behavior of glibc/libiconv but it does not
-      ;; conform to the Unicode 6.0.0 recommendation: according to it,
-      ;; the #\A should not be swallowed (Section 3.9 reads:
-      ;; "If the converter encounters an ill-formed UTF-8 code unit
-      ;; sequence which starts with a valid first byte, but which does
-      ;; not continue with valid successor bytes (see Table 3-7), it
-      ;; must not consume the successor bytes".)
-
-      (error                ;; 41: should be in the 80..BF range
+      ;; Section 3.9 of Unicode 6.0.0 reads:
+      ;;   "If the converter encounters an ill-formed UTF-8 code unit
+      ;;   sequence which starts with a valid first byte, but which does
+      ;;   not continue with valid successor bytes (see Table 3-7), it
+      ;;   must not consume the successor bytes".
+      ;; Glibc/libiconv do not conform to it and instead swallow the
+      ;; #x41.  This example appears literally in Section 3.9.
+      (error                ;; 41: invalid successor
+       #\A                  ;; 41: valid starting byte
         #\B
         eof))
  
-    (test-decoding-error (#xe0 #x88 #x88) "UTF-8"
+    (test-decoding-error (#xf0 #x80 #x80 #x41) "UTF-8"
+      ;; According to Unicode 6.0.0, Section 3.9, "the only formal
+      ;; requirement mandated by Unicode conformance for a converter is
+      ;; that the <41> be processed and correctly interpreted as
+      ;; <U+0041>".
       (error                ;; 2nd byte should be in the 90..BF range
+       error                ;; 80: not a valid starting byte
+       error                ;; 80: not a valid starting byte
+       #\A
         eof))
  
      (test-decoding-error (#xe0 #xa0 #x41 #x42) "UTF-8"
        (error                ;; 3rd byte should be in the 80..BF range
+       #\A
         #\B
         eof))
  
      (test-decoding-error (#xf0 #x88 #x88 #x88) "UTF-8"
        (error                ;; 2nd byte should be in the 90..BF range
+       error                ;; 88: not a valid starting byte
+       error                ;; 88: not a valid starting byte
+       error                ;; 88: not a valid starting byte
         eof))))
  
  (with-test-prefix "call-with-output-string"
author	Ludovic Courtès <ludo@gnu.org>
	Sat, 7 May 2011 20:46:38 +0000 (22:46 +0200)
committer	Ludovic Courtès <ludo@gnu.org>
	Sat, 7 May 2011 20:47:49 +0000 (22:47 +0200)
libguile/ports.c		patch \| blob \| blame \| history
test-suite/tests/ports.test		patch \| blob \| blame \| history