Merge remote-tracking branch 'origin/stable-2.0'
[bpt/guile.git] / libguile / hash.c
1 /* Copyright (C) 1995,1996,1997, 2000, 2001, 2003, 2004, 2006, 2008, 2009, 2010, 2011 Free Software Foundation, Inc.
2 *
3 * This library is free software; you can redistribute it and/or
4 * modify it under the terms of the GNU Lesser General Public License
5 * as published by the Free Software Foundation; either version 3 of
6 * the License, or (at your option) any later version.
7 *
8 * This library is distributed in the hope that it will be useful, but
9 * WITHOUT ANY WARRANTY; without even the implied warranty of
10 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
11 * Lesser General Public License for more details.
12 *
13 * You should have received a copy of the GNU Lesser General Public
14 * License along with this library; if not, write to the Free Software
15 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
16 * 02110-1301 USA
17 */
18
19
20 \f
21 #ifdef HAVE_CONFIG_H
22 # include <config.h>
23 #endif
24
25 #ifdef HAVE_WCHAR_H
26 #include <wchar.h>
27 #endif
28
29 #include <math.h>
30 #include <unistr.h>
31
32 #include "libguile/_scm.h"
33 #include "libguile/chars.h"
34 #include "libguile/ports.h"
35 #include "libguile/strings.h"
36 #include "libguile/symbols.h"
37 #include "libguile/vectors.h"
38
39 #include "libguile/validate.h"
40 #include "libguile/hash.h"
41 \f
42
43 #ifndef floor
44 extern double floor();
45 #endif
46
47
48 /* This hash function is originally from
49 http://burtleburtle.net/bob/c/lookup3.c by Bob Jenkins, May 2006,
50 Public Domain. No warranty. */
51
52 #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
53 #define mix(a,b,c) \
54 { \
55 a -= c; a ^= rot(c, 4); c += b; \
56 b -= a; b ^= rot(a, 6); a += c; \
57 c -= b; c ^= rot(b, 8); b += a; \
58 a -= c; a ^= rot(c,16); c += b; \
59 b -= a; b ^= rot(a,19); a += c; \
60 c -= b; c ^= rot(b, 4); b += a; \
61 }
62
63 #define final(a,b,c) \
64 { \
65 c ^= b; c -= rot(b,14); \
66 a ^= c; a -= rot(c,11); \
67 b ^= a; b -= rot(a,25); \
68 c ^= b; c -= rot(b,16); \
69 a ^= c; a -= rot(c,4); \
70 b ^= a; b -= rot(a,14); \
71 c ^= b; c -= rot(b,24); \
72 }
73
74 #define JENKINS_LOOKUP3_HASHWORD2(k, length, ret) \
75 do { \
76 scm_t_uint32 a, b, c; \
77 \
78 /* Set up the internal state. */ \
79 a = b = c = 0xdeadbeef + ((scm_t_uint32)(length<<2)) + 47; \
80 \
81 /* Handle most of the key. */ \
82 while (length > 3) \
83 { \
84 a += k[0]; \
85 b += k[1]; \
86 c += k[2]; \
87 mix (a, b, c); \
88 length -= 3; \
89 k += 3; \
90 } \
91 \
92 /* Handle the last 3 elements. */ \
93 switch(length) /* All the case statements fall through. */ \
94 { \
95 case 3 : c += k[2]; \
96 case 2 : b += k[1]; \
97 case 1 : a += k[0]; \
98 final (a, b, c); \
99 case 0: /* case 0: nothing left to add */ \
100 break; \
101 } \
102 \
103 if (sizeof (ret) == 8) \
104 ret = (((unsigned long) c) << 32) | b; \
105 else \
106 ret = c; \
107 } while (0)
108
109
110 static unsigned long
111 narrow_string_hash (const scm_t_uint8 *str, size_t len)
112 {
113 unsigned long ret;
114 JENKINS_LOOKUP3_HASHWORD2 (str, len, ret);
115 ret >>= 2; /* Ensure that it fits in a fixnum. */
116 return ret;
117 }
118
119 static unsigned long
120 wide_string_hash (const scm_t_wchar *str, size_t len)
121 {
122 unsigned long ret;
123 JENKINS_LOOKUP3_HASHWORD2 (str, len, ret);
124 ret >>= 2; /* Ensure that it fits in a fixnum. */
125 return ret;
126 }
127
128 unsigned long
129 scm_string_hash (const unsigned char *str, size_t len)
130 {
131 return narrow_string_hash (str, len);
132 }
133
134 unsigned long
135 scm_i_string_hash (SCM str)
136 {
137 size_t len = scm_i_string_length (str);
138
139 if (scm_i_is_narrow_string (str))
140 return narrow_string_hash ((const scm_t_uint8 *) scm_i_string_chars (str),
141 len);
142 else
143 return wide_string_hash (scm_i_string_wide_chars (str), len);
144 }
145
146 unsigned long
147 scm_i_locale_string_hash (const char *str, size_t len)
148 {
149 return scm_i_string_hash (scm_from_locale_stringn (str, len));
150 }
151
152 unsigned long
153 scm_i_latin1_string_hash (const char *str, size_t len)
154 {
155 if (len == (size_t) -1)
156 len = strlen (str);
157
158 return narrow_string_hash ((const scm_t_uint8 *) str, len);
159 }
160
161 /* A tricky optimization, but probably worth it. */
162 unsigned long
163 scm_i_utf8_string_hash (const char *str, size_t len)
164 {
165 const scm_t_uint8 *end, *ustr = (const scm_t_uint8 *) str;
166 unsigned long ret;
167
168 /* The length of the string in characters. This name corresponds to
169 Jenkins' original name. */
170 size_t length;
171
172 scm_t_uint32 a, b, c, u32;
173
174 if (len == (size_t) -1)
175 len = strlen (str);
176
177 end = ustr + len;
178
179 if (u8_check (ustr, len) != NULL)
180 /* Invalid UTF-8; punt. */
181 return scm_i_string_hash (scm_from_utf8_stringn (str, len));
182
183 length = u8_strnlen (ustr, len);
184
185 /* Set up the internal state. */
186 a = b = c = 0xdeadbeef + ((scm_t_uint32)(length<<2)) + 47;
187
188 /* Handle most of the key. */
189 while (length > 3)
190 {
191 ustr += u8_mbtouc_unsafe (&u32, ustr, end - ustr);
192 a += u32;
193 ustr += u8_mbtouc_unsafe (&u32, ustr, end - ustr);
194 b += u32;
195 ustr += u8_mbtouc_unsafe (&u32, ustr, end - ustr);
196 c += u32;
197 mix (a, b, c);
198 length -= 3;
199 }
200
201 /* Handle the last 3 elements's. */
202 ustr += u8_mbtouc_unsafe (&u32, ustr, end - ustr);
203 a += u32;
204 if (--length)
205 {
206 ustr += u8_mbtouc_unsafe (&u32, ustr, end - ustr);
207 b += u32;
208 if (--length)
209 {
210 ustr += u8_mbtouc_unsafe (&u32, ustr, end - ustr);
211 c += u32;
212 }
213 }
214
215 final (a, b, c);
216
217 if (sizeof (unsigned long) == 8)
218 ret = (((unsigned long) c) << 32) | b;
219 else
220 ret = c;
221
222 ret >>= 2; /* Ensure that it fits in a fixnum. */
223 return ret;
224 }
225
226
227 /* Thomas Wang's integer hasher, from
228 http://www.cris.com/~Ttwang/tech/inthash.htm. */
229 static unsigned long
230 scm_raw_ihashq (scm_t_bits key)
231 {
232 if (sizeof (key) < 8)
233 {
234 key = (key ^ 61) ^ (key >> 16);
235 key = key + (key << 3);
236 key = key ^ (key >> 4);
237 key = key * 0x27d4eb2d;
238 key = key ^ (key >> 15);
239 }
240 else
241 {
242 key = (~key) + (key << 21); // key = (key << 21) - key - 1;
243 key = key ^ (key >> 24);
244 key = (key + (key << 3)) + (key << 8); // key * 265
245 key = key ^ (key >> 14);
246 key = (key + (key << 2)) + (key << 4); // key * 21
247 key = key ^ (key >> 28);
248 key = key + (key << 31);
249 }
250 key >>= 2; /* Ensure that it fits in a fixnum. */
251 return key;
252 }
253
254 /* `depth' is used to limit recursion. */
255 static unsigned long
256 scm_raw_ihash (SCM obj, size_t depth)
257 {
258 if (SCM_IMP (obj))
259 return scm_raw_ihashq (SCM_UNPACK (obj));
260
261 switch (SCM_TYP7(obj))
262 {
263 /* FIXME: do better for structs, variables, ... Also the hashes
264 are currently associative, which ain't the right thing. */
265 case scm_tc7_smob:
266 return scm_raw_ihashq (SCM_TYP16 (obj));
267 case scm_tc7_number:
268 if (scm_is_integer (obj))
269 {
270 SCM n = SCM_I_MAKINUM (SCM_MOST_POSITIVE_FIXNUM);
271 if (scm_is_inexact (obj))
272 obj = scm_inexact_to_exact (obj);
273 return scm_raw_ihashq (scm_to_ulong (scm_modulo (obj, n)));
274 }
275 else
276 return scm_i_string_hash (scm_number_to_string (obj, scm_from_int (10)));
277 case scm_tc7_string:
278 return scm_i_string_hash (obj);
279 case scm_tc7_symbol:
280 return scm_i_symbol_hash (obj);
281 case scm_tc7_pointer:
282 return scm_raw_ihashq ((scm_t_uintptr) SCM_POINTER_VALUE (obj));
283 case scm_tc7_wvect:
284 case scm_tc7_vector:
285 {
286 size_t len = SCM_SIMPLE_VECTOR_LENGTH (obj);
287 size_t i = depth / 2;
288 unsigned long h = scm_raw_ihashq (SCM_CELL_WORD_0 (obj));
289 if (len)
290 while (i--)
291 h ^= scm_raw_ihash (scm_c_vector_ref (obj, h % len), i);
292 return h;
293 }
294 case scm_tcs_cons_imcar:
295 case scm_tcs_cons_nimcar:
296 if (depth)
297 return (scm_raw_ihash (SCM_CAR (obj), depth / 2)
298 ^ scm_raw_ihash (SCM_CDR (obj), depth / 2));
299 else
300 return scm_raw_ihashq (scm_tc3_cons);
301 default:
302 return scm_raw_ihashq (SCM_CELL_WORD_0 (obj));
303 }
304 }
305
306
307 \f
308
309 unsigned long
310 scm_ihashq (SCM obj, unsigned long n)
311 {
312 return scm_raw_ihashq (SCM_UNPACK (obj)) % n;
313 }
314
315
316 SCM_DEFINE (scm_hashq, "hashq", 2, 0, 0,
317 (SCM key, SCM size),
318 "Determine a hash value for @var{key} that is suitable for\n"
319 "lookups in a hashtable of size @var{size}, where @code{eq?} is\n"
320 "used as the equality predicate. The function returns an\n"
321 "integer in the range 0 to @var{size} - 1. Note that\n"
322 "@code{hashq} may use internal addresses. Thus two calls to\n"
323 "hashq where the keys are @code{eq?} are not guaranteed to\n"
324 "deliver the same value if the key object gets garbage collected\n"
325 "in between. This can happen, for example with symbols:\n"
326 "@code{(hashq 'foo n) (gc) (hashq 'foo n)} may produce two\n"
327 "different values, since @code{foo} will be garbage collected.")
328 #define FUNC_NAME s_scm_hashq
329 {
330 unsigned long sz = scm_to_unsigned_integer (size, 1, ULONG_MAX);
331 return scm_from_ulong (scm_ihashq (key, sz));
332 }
333 #undef FUNC_NAME
334
335
336 \f
337
338
339 unsigned long
340 scm_ihashv (SCM obj, unsigned long n)
341 {
342 if (SCM_NUMP(obj))
343 return scm_raw_ihash (obj, 10) % n;
344 else
345 return scm_raw_ihashq (SCM_UNPACK (obj)) % n;
346 }
347
348
349 SCM_DEFINE (scm_hashv, "hashv", 2, 0, 0,
350 (SCM key, SCM size),
351 "Determine a hash value for @var{key} that is suitable for\n"
352 "lookups in a hashtable of size @var{size}, where @code{eqv?} is\n"
353 "used as the equality predicate. The function returns an\n"
354 "integer in the range 0 to @var{size} - 1. Note that\n"
355 "@code{(hashv key)} may use internal addresses. Thus two calls\n"
356 "to hashv where the keys are @code{eqv?} are not guaranteed to\n"
357 "deliver the same value if the key object gets garbage collected\n"
358 "in between. This can happen, for example with symbols:\n"
359 "@code{(hashv 'foo n) (gc) (hashv 'foo n)} may produce two\n"
360 "different values, since @code{foo} will be garbage collected.")
361 #define FUNC_NAME s_scm_hashv
362 {
363 unsigned long sz = scm_to_unsigned_integer (size, 1, ULONG_MAX);
364 return scm_from_ulong (scm_ihashv (key, sz));
365 }
366 #undef FUNC_NAME
367
368
369 \f
370
371
372 unsigned long
373 scm_ihash (SCM obj, unsigned long n)
374 {
375 return (unsigned long) scm_raw_ihash (obj, 10) % n;
376 }
377
378 SCM_DEFINE (scm_hash, "hash", 2, 0, 0,
379 (SCM key, SCM size),
380 "Determine a hash value for @var{key} that is suitable for\n"
381 "lookups in a hashtable of size @var{size}, where @code{equal?}\n"
382 "is used as the equality predicate. The function returns an\n"
383 "integer in the range 0 to @var{size} - 1.")
384 #define FUNC_NAME s_scm_hash
385 {
386 unsigned long sz = scm_to_unsigned_integer (size, 1, ULONG_MAX);
387 return scm_from_ulong (scm_ihash (key, sz));
388 }
389 #undef FUNC_NAME
390
391
392 \f
393
394
395 void
396 scm_init_hash ()
397 {
398 #include "libguile/hash.x"
399 }
400
401
402 /*
403 Local Variables:
404 c-file-style: "gnu"
405 End:
406 */