Fix scm_to_utf8_stringn to return the length in bytes, et al

author Mark H Weaver <mhw@netris.org>

Mon, 2 Apr 2012 22:55:45 +0000 (18:55 -0400)

committer Mark H Weaver <mhw@netris.org>

Mon, 2 Apr 2012 22:55:45 +0000 (18:55 -0400)
author Mark H Weaver <mhw@netris.org>
Mon, 2 Apr 2012 22:55:45 +0000 (18:55 -0400)
committer Mark H Weaver <mhw@netris.org>
Mon, 2 Apr 2012 22:55:45 +0000 (18:55 -0400)
diff --git a/libguile/strings.c b/libguile/strings.c

index a9e5afe..f4828f8 100644 (file)
--- a/libguile/strings.c
+++ b/libguile/strings.c
@@ -1874,19 +1874,28 @@ latin1_to_u8 (const scm_t_uint8 *str, size_t latin_len,
    return u8_result;
  }
  
-/* From RFC 3629:
+/* UTF-8 code table
+
+   (Note that this includes code points that are not allowed by Unicode,
+    but since this function has no way to report an error, and its
+    purpose is to determine the size of destination buffers for
+    libunistring conversion functions, we err on the safe side and handle
+    everything that libunistring might conceivably handle, now or in the
+    future.)
  
     Char. number range  |        UTF-8 octet sequence
        (hexadecimal)    |              (binary)
-   --------------------+---------------------------------------------
+   --------------------+------------------------------------------------------
     0000 0000-0000 007F | 0xxxxxxx
     0000 0080-0000 07FF | 110xxxxx 10xxxxxx
     0000 0800-0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx
-   0001 0000-0010 FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+   0001 0000-001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+   0020 0000-03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+   0400 0000-7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
  */
  
  static size_t
-u32_u8_strlen (const scm_t_uint32 *str, size_t len)
+u32_u8_length_in_bytes (const scm_t_uint32 *str, size_t len)
  {
    size_t ret, i;
  
@@ -1900,8 +1909,12 @@ u32_u8_strlen (const scm_t_uint32 *str, size_t len)
          ret += 2;
        else if (c <= 0xffff)
          ret += 3;
-      else
+      else if (c <= 0x1fffff)
          ret += 4;
+      else if (c <= 0x3ffffff)
+        ret += 5;
+      else
+        ret += 6;
      }
  
    return ret;
@@ -1917,21 +1930,28 @@ scm_to_utf8_stringn (SCM str, size_t *lenp)
    else
      {
        scm_t_uint8 *buf, *ret;
-      size_t len, allocated;
+      size_t predicted_len, actual_len;  /* length in bytes */
  
-      len = u32_u8_strlen ((scm_t_uint32*)scm_i_string_wide_chars (str),
-                           scm_i_string_length (str));
-      allocated = len + 1;
-      buf = scm_malloc (allocated);
+      predicted_len = u32_u8_length_in_bytes
+        ((scm_t_uint32 *) scm_i_string_wide_chars (str),
+         scm_i_string_length (str));
  
-      ret = u32_to_u8 ((scm_t_uint32*)scm_i_string_wide_chars (str),
-                       scm_i_string_length (str), buf, &len);
-
-      if (ret == buf && len + 1 == allocated)
+      if (lenp)
          {
+          *lenp = predicted_len;
+          buf = scm_malloc (predicted_len);
          }
+      else
+        {
+          /* No length out-parameter: hand back a NUL-terminated string.
+             The terminator is stored through BUF, because RET is not
+             assigned until the conversion below.  */
+          buf = scm_malloc (predicted_len + 1);
+          buf[predicted_len] = 0;
+        }
+
+      /* u32_to_u8 reads *LENGTHP as the capacity of BUF before
+         overwriting it with the number of bytes produced, so it must
+         be initialized.  Limiting it to PREDICTED_LEN also keeps the
+         terminator written above out of the conversion's reach.  */
+      actual_len = predicted_len;
+      ret = u32_to_u8 ((scm_t_uint32 *) scm_i_string_wide_chars (str),
+                       scm_i_string_length (str), buf, &actual_len);
+
+      if (SCM_LIKELY (ret == buf && actual_len == predicted_len))
+        return (char *) ret;
  
        /* An error: a bad codepoint.  */
        {
author	Mark H Weaver <mhw@netris.org>
	Mon, 2 Apr 2012 22:55:45 +0000 (18:55 -0400)
committer	Mark H Weaver <mhw@netris.org>
	Mon, 2 Apr 2012 22:55:45 +0000 (18:55 -0400)