From 24d23822ee9d6a515aed8baaeff9d363fd7ec813 Mon Sep 17 00:00:00 2001 From: Michael Gran Date: Fri, 28 Aug 2009 23:47:42 -0700 Subject: [PATCH] Surrogate characters shouldn't be in charsets * libguile/srfi-14.c (charsets_complement): use surrogate #defines instead of hardcoded numbers * libguile/srfi-14.i.c (cs_full_ranges): remove surrogates from full charset * libguile/unidata_to_charset.pl (full): test for surrogates --- libguile/srfi-14.c | 5 +++-- libguile/srfi-14.i.c | 5 +++-- libguile/unidata_to_charset.pl | 10 +++++++--- 3 files changed, 13 insertions(+), 7 deletions(-) diff --git a/libguile/srfi-14.c b/libguile/srfi-14.c index 7ab65ac97..50229ef0f 100644 --- a/libguile/srfi-14.c +++ b/libguile/srfi-14.c @@ -29,6 +29,7 @@ #include "libguile.h" #include "libguile/srfi-14.h" #include "libguile/strings.h" +#include "libguile/chars.h" /* Include the pre-computed standard charset data. */ #include "libguile/srfi-14.i.c" @@ -386,8 +387,8 @@ charsets_complement (scm_t_char_set *p, scm_t_char_set *q) p->ranges = scm_gc_malloc (sizeof (scm_t_char_range) * 2, "character-set"); p->ranges[0].lo = 0; - p->ranges[0].hi = 0xd7ff; - p->ranges[1].lo = 0xe000; + p->ranges[0].hi = SCM_CODEPOINT_SURROGATE_START - 1; + p->ranges[1].lo = SCM_CODEPOINT_SURROGATE_END + 1; p->ranges[1].hi = SCM_CODEPOINT_MAX; return; } diff --git a/libguile/srfi-14.i.c b/libguile/srfi-14.i.c index 5ef21f333..d92b4d73e 100644 --- a/libguile/srfi-14.i.c +++ b/libguile/srfi-14.i.c @@ -2,7 +2,8 @@ /* This file is #include'd by srfi-14.c. */ -/* This file was generated from http://unicode.org/Public/UNIDATA/UnicodeData.txt +/* This file was generated from + http://unicode.org/Public/UNIDATA/UnicodeData.txt with the unidata_to_charset.pl script. */ scm_t_char_range cs_lower_case_ranges[] = { @@ -6925,7 +6926,7 @@ scm_t_char_range cs_full_ranges[] = { , {0xac00, 0xd7a3} , - {0xd800, 0xfa2d} + {0xe000, 0xfa2d} , {0xfa30, 0xfa6a} , diff --git a/libguile/unidata_to_charset.pl b/libguile/unidata_to_charset.pl index 6871e67ee..61c8d100e 100755 --- a/libguile/unidata_to_charset.pl +++ b/libguile/unidata_to_charset.pl @@ -254,10 +254,14 @@ sub empty { return 0; } -# Full -- All characters. +# Full -- All characters except for the surrogates sub full { my($codepoint, $name, $category, $uppercase, $lowercase)= @_; - return 1; + if ($category =~ (/Cs/)) { + return 0; + } else { + return 1; + } } @@ -362,7 +366,7 @@ sub compute { # Write a bit of a header print $out "/* srfi-14.i.c -- standard SRFI-14 character set data */\n\n"; print $out "/* This file is #include'd by srfi-14.c. */\n\n"; -print $out "/* This file was generated from\n" +print $out "/* This file was generated from\n"; print $out " http://unicode.org/Public/UNIDATA/UnicodeData.txt\n"; print $out " with the unidata_to_charset.pl script. */\n\n"; -- 2.20.1