From f80d15c59e962d197c0cb8e98fd84bdd27bc449e Mon Sep 17 00:00:00 2001
From: Andy Wingo
Date: Tue, 25 Oct 2011 17:32:50 +0200
Subject: [PATCH] optimize utf8 symbol lookup

* libguile/symbols.c (utf8_string_equals_wide_string)
  (utf8_lookup_predicate_fn, lookup_interned_utf8_symbol): Optimize utf8
  symbol lookup.
---
Changes relative to commit f80d15c5 as originally written:
  - utf8_lookup_predicate_fn compares with memcmp rather than strncmp,
    so names containing an embedded NUL are compared in full.
  - utf8_string_equals_wide_string decodes with u8_mbtoucr rather than
    u8_mbtouc, so malformed UTF-8 is actually rejected.
  - Explanatory comments added.

 libguile/symbols.c | 106 ++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 104 insertions(+), 2 deletions(-)

diff --git a/libguile/symbols.c b/libguile/symbols.c
index 498e46ce8..9cb300ab0 100644
--- a/libguile/symbols.c
+++ b/libguile/symbols.c
@@ -23,6 +23,8 @@
 #  include <config.h>
 #endif
 
+#include <unistr.h>
+
 #include "libguile/_scm.h"
 #include "libguile/chars.h"
 #include "libguile/eval.h"
@@ -144,6 +146,89 @@ lookup_interned_latin1_symbol (const char *str, size_t len,
                                 &data, SCM_BOOL_F);
 }
 
+/* Closure passed to utf8_lookup_predicate_fn: the UTF-8 byte sequence
+   being looked up, its length in bytes, and its precomputed hash.  */
+struct utf8_lookup_data
+{
+  const char *str;
+  size_t len;
+  unsigned long string_hash;
+};
+
+/* Return nonzero if the NLEN bytes of UTF-8 at NARROW decode to exactly
+   the WLEN code points stored at WIDE, and zero otherwise.  Invalid or
+   truncated UTF-8 never compares equal.  */
+static int
+utf8_string_equals_wide_string (const scm_t_uint8 *narrow, size_t nlen,
+                                const scm_t_wchar *wide, size_t wlen)
+{
+  size_t byte_idx = 0, char_idx = 0;
+
+  while (byte_idx < nlen && char_idx < wlen)
+    {
+      ucs4_t c;
+      int nbytes;
+
+      /* Unlike u8_mbtouc, u8_mbtoucr reports malformed input with a
+         negative return value instead of silently yielding U+FFFD.  */
+      nbytes = u8_mbtoucr (&c, narrow + byte_idx, nlen - byte_idx);
+      if (nbytes <= 0)
+        /* Bad UTF-8.  */
+        return 0;
+      else if (c != wide[char_idx])
+        return 0;
+
+      byte_idx += nbytes;
+      char_idx++;
+    }
+
+  return byte_idx == nlen && char_idx == wlen;
+}
+
+/* Weak-set predicate: return nonzero if SYM is the symbol whose name is
+   the UTF-8 string described by CLOSURE (a struct utf8_lookup_data).  */
+static int
+utf8_lookup_predicate_fn (SCM sym, void *closure)
+{
+  struct utf8_lookup_data *data = closure;
+
+  /* Cheap rejection before looking at any characters.  */
+  if (scm_i_symbol_hash (sym) != data->string_hash)
+    return 0;
+
+  if (scm_i_is_narrow_symbol (sym))
+    /* Both buffers are length-delimited, so compare with memcmp; strncmp
+       would stop at an embedded NUL and report a false match.
+       NOTE(review): a byte-wise comparison is only meaningful where the
+       narrow representation and UTF-8 coincide (presumably ASCII);
+       confirm that the hash check screens out everything else.  */
+    return (scm_i_symbol_length (sym) == data->len
+            && memcmp (scm_i_symbol_chars (sym), data->str, data->len) == 0);
+  else
+    return utf8_string_equals_wide_string ((const scm_t_uint8 *) data->str,
+                                           data->len,
+                                           scm_i_symbol_wide_chars (sym),
+                                           scm_i_symbol_length (sym));
+}
+
+/* Look up the interned symbol named by the LEN bytes of UTF-8 at STR,
+   whose hash is RAW_HASH.  Return the symbol, or SCM_BOOL_F if the
+   lookup finds none.  */
+static SCM
+lookup_interned_utf8_symbol (const char *str, size_t len,
+                             unsigned long raw_hash)
+{
+  struct utf8_lookup_data data;
+
+  data.str = str;
+  data.len = len;
+  data.string_hash = raw_hash;
+
+  return scm_c_weak_set_lookup (symbols, raw_hash,
+                                utf8_lookup_predicate_fn,
+                                &data, SCM_BOOL_F);
+}
+
 static int
 symbol_lookup_predicate_fn (SCM sym, void *closure)
 {
@@ -459,8 +544,25 @@ scm_from_utf8_symbol (const char *sym)
 SCM
 scm_from_utf8_symboln (const char *sym, size_t len)
 {
-  SCM str = scm_from_utf8_stringn (sym, len);
-  return scm_i_str2symbol (str);
+  unsigned long hash;
+  SCM ret;
+
+  /* A length of (size_t) -1 means SYM is NUL-terminated.  */
+  if (len == (size_t) -1)
+    len = strlen (sym);
+  hash = scm_i_utf8_string_hash (sym, len);
+
+  /* Fast path: look for an existing symbol directly from the UTF-8
+     bytes, without first building a string object.  */
+  ret = lookup_interned_utf8_symbol (sym, len, hash);
+  if (scm_is_false (ret))
+    {
+      /* Not found: fall back to the original string-based path.  */
+      SCM str = scm_from_utf8_stringn (sym, len);
+      ret = scm_i_str2symbol (str);
+    }
+
+  return ret;
 }
 
 void
-- 
2.20.1