From 8c76a8971ba92ebdf657199b74506f607987b523 Mon Sep 17 00:00:00 2001 From: Andy Wingo Date: Tue, 15 Jan 2013 11:01:10 +0100 Subject: [PATCH] fix bug where scm_from_utf8_stringn would not detect bad utf-8 * libguile/strings.c (scm_from_utf8_stringn): * libguile/symbols.c (utf8_string_equals_wide_string): The "bad UTF8" return from u8_mbtouc is a 0xfffd character, not a negative byte length. Fixes a bug in which invalid UTF-8 would not be caught. * libguile/bytevectors.c (scm_utf8_to_string): Use scm_from_utf8_stringn directly. Just a little cleanup. * test-suite/tests/iconv.test ("narrow non-ascii string"): Add test for parsing bad utf-8 with substitution. --- libguile/bytevectors.c | 5 ++--- libguile/strings.c | 7 ++++--- libguile/symbols.c | 4 ++-- test-suite/tests/iconv.test | 5 +++++ 4 files changed, 13 insertions(+), 8 deletions(-) diff --git a/libguile/bytevectors.c b/libguile/bytevectors.c index db132d43f..4ce90ebdf 100644 --- a/libguile/bytevectors.c +++ b/libguile/bytevectors.c @@ -1,4 +1,4 @@ -/* Copyright (C) 2009, 2010, 2011, 2012 Free Software Foundation, Inc. +/* Copyright (C) 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License @@ -2050,8 +2050,7 @@ SCM_DEFINE (scm_utf8_to_string, "utf8->string", c_utf_len = SCM_BYTEVECTOR_LENGTH (utf); c_utf = (char *) SCM_BYTEVECTOR_CONTENTS (utf); - str = scm_from_stringn (c_utf, c_utf_len, "UTF-8", - SCM_FAILED_CONVERSION_ERROR); + str = scm_from_utf8_stringn (c_utf, c_utf_len); return (str); } diff --git a/libguile/strings.c b/libguile/strings.c index 5130cb362..1e89e63d6 100644 --- a/libguile/strings.c +++ b/libguile/strings.c @@ -1,4 +1,4 @@ -/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012 Free Software Foundation, Inc. +/* Copyright (C) 1995,1996,1998,2000,2001, 2004, 2006, 2008, 2009, 2010, 2011, 2012, 2013 Free Software Foundation, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License @@ -1526,7 +1526,8 @@ scm_from_stringn (const char *str, size_t len, const char *encoding, if (encoding == NULL || len == 0) return scm_from_latin1_stringn (str, len); - else if (strcmp (encoding, "UTF-8") == 0) + else if (strcmp (encoding, "UTF-8") == 0 + && handler == SCM_FAILED_CONVERSION_ERROR) return scm_from_utf8_stringn (str, len); u32len = 0; @@ -1639,7 +1640,7 @@ scm_from_utf8_stringn (const char *str, size_t len) nbytes = u8_mbtouc (&c, ustr + i, len - i); - if (nbytes < 0) + if (c == 0xfffd) /* Bad UTF-8. */ decoding_error (__func__, errno, str, len); diff --git a/libguile/symbols.c b/libguile/symbols.c index fd7e21470..f93833b9d 100644 --- a/libguile/symbols.c +++ b/libguile/symbols.c @@ -1,5 +1,5 @@ /* Copyright (C) 1995, 1996, 1997, 1998, 2000, 2001, 2003, 2004, - * 2006, 2009, 2011 Free Software Foundation, Inc. + * 2006, 2009, 2011, 2013 Free Software Foundation, Inc. * * This library is free software; you can redistribute it and/or * modify it under the terms of the GNU Lesser General Public License @@ -167,7 +167,7 @@ utf8_string_equals_wide_string (const scm_t_uint8 *narrow, size_t nlen, nbytes = u8_mbtouc (&c, narrow + byte_idx, nlen - byte_idx); if (nbytes == 0) break; - else if (nbytes < 0) + else if (c == 0xfffd) /* Bad UTF-8. */ return 0; else if (c != wide[char_idx]) diff --git a/test-suite/tests/iconv.test b/test-suite/tests/iconv.test index 9083cd256..be36336f3 100644 --- a/test-suite/tests/iconv.test +++ b/test-suite/tests/iconv.test @@ -94,6 +94,11 @@ (pass-if-exception "misparse latin1 as utf8" exception:decoding-error (bytevector->string (string->bytevector s "latin1") "utf-8")) + (pass-if "misparse latin1 as utf8 with substitutions" + (equal? (bytevector->string (string->bytevector s "latin1") + "utf-8" 'substitute) + "?t?")) + (pass-if-exception "misparse latin1 as ascii" exception:decoding-error (bytevector->string (string->bytevector s "latin1") "ascii")))) -- 2.20.1