Convert emit-linear-dispatch to use match
[bpt/guile.git] / libguile / hash.c
1 /* Copyright (C) 1995, 1996, 1997, 2000, 2001, 2003, 2004, 2006, 2008,
2 * 2009, 2010, 2011, 2012, 2014, 2015 Free Software Foundation, Inc.
3 *
4 * This library is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU Lesser General Public License
6 * as published by the Free Software Foundation; either version 3 of
7 * the License, or (at your option) any later version.
8 *
9 * This library is distributed in the hope that it will be useful, but
10 * WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * Lesser General Public License for more details.
13 *
14 * You should have received a copy of the GNU Lesser General Public
15 * License along with this library; if not, write to the Free Software
16 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
17 * 02110-1301 USA
18 */
19
20
21 \f
22 #ifdef HAVE_CONFIG_H
23 # include <config.h>
24 #endif
25
26 #ifdef HAVE_WCHAR_H
27 #include <wchar.h>
28 #endif
29
30 #include <math.h>
31 #include <unistr.h>
32
33 #include "libguile/_scm.h"
34 #include "libguile/chars.h"
35 #include "libguile/ports.h"
36 #include "libguile/strings.h"
37 #include "libguile/symbols.h"
38 #include "libguile/vectors.h"
39
40 #include "libguile/validate.h"
41 #include "libguile/hash.h"
42 \f
43
44 #ifndef floor
45 extern double floor();
46 #endif
47
48
49 /* This hash function is originally from
50 http://burtleburtle.net/bob/c/lookup3.c by Bob Jenkins, May 2006,
51 Public Domain. No warranty. */
52
53 #define rot(x,k) (((x)<<(k)) | ((x)>>(32-(k))))
54 #define mix(a,b,c) \
55 { \
56 a -= c; a ^= rot(c, 4); c += b; \
57 b -= a; b ^= rot(a, 6); a += c; \
58 c -= b; c ^= rot(b, 8); b += a; \
59 a -= c; a ^= rot(c,16); c += b; \
60 b -= a; b ^= rot(a,19); a += c; \
61 c -= b; c ^= rot(b, 4); b += a; \
62 }
63
64 #define final(a,b,c) \
65 { \
66 c ^= b; c -= rot(b,14); \
67 a ^= c; a -= rot(c,11); \
68 b ^= a; b -= rot(a,25); \
69 c ^= b; c -= rot(b,16); \
70 a ^= c; a -= rot(c,4); \
71 b ^= a; b -= rot(a,14); \
72 c ^= b; c -= rot(b,24); \
73 }
74
75 #define JENKINS_LOOKUP3_HASHWORD2(k, length, ret) \
76 do { \
77 scm_t_uint32 a, b, c; \
78 \
79 /* Set up the internal state. */ \
80 a = b = c = 0xdeadbeef + ((scm_t_uint32)(length<<2)) + 47; \
81 \
82 /* Handle most of the key. */ \
83 while (length > 3) \
84 { \
85 a += k[0]; \
86 b += k[1]; \
87 c += k[2]; \
88 mix (a, b, c); \
89 length -= 3; \
90 k += 3; \
91 } \
92 \
93 /* Handle the last 3 elements. */ \
94 switch(length) /* All the case statements fall through. */ \
95 { \
96 case 3 : c += k[2]; \
97 case 2 : b += k[1]; \
98 case 1 : a += k[0]; \
99 final (a, b, c); \
100 case 0: /* case 0: nothing left to add */ \
101 break; \
102 } \
103 \
104 if (sizeof (ret) == 8) \
105 ret = (((unsigned long) c) << 32) | b; \
106 else \
107 ret = c; \
108 } while (0)
109
110
111 static unsigned long
112 narrow_string_hash (const scm_t_uint8 *str, size_t len)
113 {
114 unsigned long ret;
115 JENKINS_LOOKUP3_HASHWORD2 (str, len, ret);
116 ret >>= 2; /* Ensure that it fits in a fixnum. */
117 return ret;
118 }
119
120 static unsigned long
121 wide_string_hash (const scm_t_wchar *str, size_t len)
122 {
123 unsigned long ret;
124 JENKINS_LOOKUP3_HASHWORD2 (str, len, ret);
125 ret >>= 2; /* Ensure that it fits in a fixnum. */
126 return ret;
127 }
128
129 unsigned long
130 scm_i_string_hash (SCM str)
131 {
132 size_t len = scm_i_string_length (str);
133
134 if (scm_i_is_narrow_string (str))
135 return narrow_string_hash ((const scm_t_uint8 *) scm_i_string_chars (str),
136 len);
137 else
138 return wide_string_hash (scm_i_string_wide_chars (str), len);
139 }
140
141 unsigned long
142 scm_i_locale_string_hash (const char *str, size_t len)
143 {
144 return scm_i_string_hash (scm_from_locale_stringn (str, len));
145 }
146
147 unsigned long
148 scm_i_latin1_string_hash (const char *str, size_t len)
149 {
150 if (len == (size_t) -1)
151 len = strlen (str);
152
153 return narrow_string_hash ((const scm_t_uint8 *) str, len);
154 }
155
156 /* A tricky optimization, but probably worth it. */
157 unsigned long
158 scm_i_utf8_string_hash (const char *str, size_t len)
159 {
160 const scm_t_uint8 *end, *ustr = (const scm_t_uint8 *) str;
161 unsigned long ret;
162
163 /* The length of the string in characters. This name corresponds to
164 Jenkins' original name. */
165 size_t length;
166
167 scm_t_uint32 a, b, c, u32;
168
169 if (len == (size_t) -1)
170 len = strlen (str);
171
172 end = ustr + len;
173
174 if (u8_check (ustr, len) != NULL)
175 /* Invalid UTF-8; punt. */
176 return scm_i_string_hash (scm_from_utf8_stringn (str, len));
177
178 length = u8_strnlen (ustr, len);
179
180 /* Set up the internal state. */
181 a = b = c = 0xdeadbeef + ((scm_t_uint32)(length<<2)) + 47;
182
183 /* Handle most of the key. */
184 while (length > 3)
185 {
186 ustr += u8_mbtouc_unsafe (&u32, ustr, end - ustr);
187 a += u32;
188 ustr += u8_mbtouc_unsafe (&u32, ustr, end - ustr);
189 b += u32;
190 ustr += u8_mbtouc_unsafe (&u32, ustr, end - ustr);
191 c += u32;
192 mix (a, b, c);
193 length -= 3;
194 }
195
196 /* Handle the last 3 elements's. */
197 ustr += u8_mbtouc_unsafe (&u32, ustr, end - ustr);
198 a += u32;
199 if (--length)
200 {
201 ustr += u8_mbtouc_unsafe (&u32, ustr, end - ustr);
202 b += u32;
203 if (--length)
204 {
205 ustr += u8_mbtouc_unsafe (&u32, ustr, end - ustr);
206 c += u32;
207 }
208 }
209
210 final (a, b, c);
211
212 if (sizeof (unsigned long) == 8)
213 ret = (((unsigned long) c) << 32) | b;
214 else
215 ret = c;
216
217 ret >>= 2; /* Ensure that it fits in a fixnum. */
218 return ret;
219 }
220
221 static unsigned long scm_raw_ihashq (scm_t_bits key);
222 static unsigned long scm_raw_ihash (SCM obj, size_t depth);
223
224 /* Return the hash of struct OBJ. Traverse OBJ's fields to compute the
225 result, unless DEPTH is zero. Assumes that OBJ is a struct. */
226 static unsigned long
227 scm_i_struct_hash (SCM obj, size_t depth)
228 {
229 SCM layout;
230 scm_t_bits *data;
231 size_t struct_size, field_num;
232 unsigned long hash;
233
234 layout = SCM_STRUCT_LAYOUT (obj);
235 struct_size = scm_i_symbol_length (layout) / 2;
236 data = SCM_STRUCT_DATA (obj);
237
238 hash = scm_raw_ihashq (SCM_UNPACK (SCM_STRUCT_VTABLE (obj)));
239 if (depth > 0)
240 for (field_num = 0; field_num < struct_size; field_num++)
241 {
242 int protection;
243
244 protection = scm_i_symbol_ref (layout, field_num * 2 + 1);
245 if (protection != 'h' && protection != 'o')
246 {
247 int type;
248 type = scm_i_symbol_ref (layout, field_num * 2);
249 switch (type)
250 {
251 case 'p':
252 hash ^= scm_raw_ihash (SCM_PACK (data[field_num]),
253 depth / 2);
254 break;
255 case 'u':
256 hash ^= scm_raw_ihashq (data[field_num]);
257 break;
258 default:
259 /* Ignore 's' fields. */;
260 }
261 }
262 }
263
264 /* FIXME: Tail elements should be taken into account. */
265
266 return hash;
267 }
268
269 /* Thomas Wang's integer hasher, from
270 http://www.cris.com/~Ttwang/tech/inthash.htm. */
271 static unsigned long
272 scm_raw_ihashq (scm_t_bits key)
273 {
274 if (sizeof (key) < 8)
275 {
276 key = (key ^ 61) ^ (key >> 16);
277 key = key + (key << 3);
278 key = key ^ (key >> 4);
279 key = key * 0x27d4eb2d;
280 key = key ^ (key >> 15);
281 }
282 else
283 {
284 key = (~key) + (key << 21); // key = (key << 21) - key - 1;
285 key = key ^ (key >> 24);
286 key = (key + (key << 3)) + (key << 8); // key * 265
287 key = key ^ (key >> 14);
288 key = (key + (key << 2)) + (key << 4); // key * 21
289 key = key ^ (key >> 28);
290 key = key + (key << 31);
291 }
292 key >>= 2; /* Ensure that it fits in a fixnum. */
293 return key;
294 }
295
296 /* `depth' is used to limit recursion. */
297 static unsigned long
298 scm_raw_ihash (SCM obj, size_t depth)
299 {
300 if (SCM_IMP (obj))
301 return scm_raw_ihashq (SCM_UNPACK (obj));
302
303 switch (SCM_TYP7(obj))
304 {
305 /* FIXME: do better for structs, variables, ... Also the hashes
306 are currently associative, which ain't the right thing. */
307 case scm_tc7_smob:
308 return scm_raw_ihashq (SCM_TYP16 (obj));
309 case scm_tc7_number:
310 if (scm_is_integer (obj))
311 {
312 SCM n = SCM_I_MAKINUM (SCM_MOST_POSITIVE_FIXNUM);
313 if (scm_is_inexact (obj))
314 obj = scm_inexact_to_exact (obj);
315 return scm_raw_ihashq (scm_to_ulong (scm_modulo (obj, n)));
316 }
317 else
318 return scm_i_string_hash (scm_number_to_string (obj, scm_from_int (10)));
319 case scm_tc7_string:
320 return scm_i_string_hash (obj);
321 case scm_tc7_symbol:
322 return scm_i_symbol_hash (obj);
323 case scm_tc7_pointer:
324 return scm_raw_ihashq ((scm_t_uintptr) SCM_POINTER_VALUE (obj));
325 case scm_tc7_wvect:
326 case scm_tc7_vector:
327 {
328 size_t len = SCM_SIMPLE_VECTOR_LENGTH (obj);
329 size_t i = depth / 2;
330 unsigned long h = scm_raw_ihashq (SCM_CELL_WORD_0 (obj));
331 if (len)
332 while (i--)
333 h ^= scm_raw_ihash (scm_c_vector_ref (obj, h % len), i);
334 return h;
335 }
336 case scm_tcs_cons_imcar:
337 case scm_tcs_cons_nimcar:
338 if (depth)
339 return (scm_raw_ihash (SCM_CAR (obj), depth / 2)
340 ^ scm_raw_ihash (SCM_CDR (obj), depth / 2));
341 else
342 return scm_raw_ihashq (scm_tc3_cons);
343 case scm_tcs_struct:
344 return scm_i_struct_hash (obj, depth);
345 default:
346 return scm_raw_ihashq (SCM_CELL_WORD_0 (obj));
347 }
348 }
349
350
351 \f
352
353 unsigned long
354 scm_ihashq (SCM obj, unsigned long n)
355 {
356 return scm_raw_ihashq (SCM_UNPACK (obj)) % n;
357 }
358
359
360 SCM_DEFINE (scm_hashq, "hashq", 2, 0, 0,
361 (SCM key, SCM size),
362 "Determine a hash value for @var{key} that is suitable for\n"
363 "lookups in a hashtable of size @var{size}, where @code{eq?} is\n"
364 "used as the equality predicate. The function returns an\n"
365 "integer in the range 0 to @var{size} - 1. Note that\n"
366 "@code{hashq} may use internal addresses. Thus two calls to\n"
367 "hashq where the keys are @code{eq?} are not guaranteed to\n"
368 "deliver the same value if the key object gets garbage collected\n"
369 "in between. This can happen, for example with symbols:\n"
370 "@code{(hashq 'foo n) (gc) (hashq 'foo n)} may produce two\n"
371 "different values, since @code{foo} will be garbage collected.")
372 #define FUNC_NAME s_scm_hashq
373 {
374 unsigned long sz = scm_to_unsigned_integer (size, 1, ULONG_MAX);
375 return scm_from_ulong (scm_ihashq (key, sz));
376 }
377 #undef FUNC_NAME
378
379
380 \f
381
382
383 unsigned long
384 scm_ihashv (SCM obj, unsigned long n)
385 {
386 if (SCM_NUMP(obj))
387 return scm_raw_ihash (obj, 10) % n;
388 else
389 return scm_raw_ihashq (SCM_UNPACK (obj)) % n;
390 }
391
392
393 SCM_DEFINE (scm_hashv, "hashv", 2, 0, 0,
394 (SCM key, SCM size),
395 "Determine a hash value for @var{key} that is suitable for\n"
396 "lookups in a hashtable of size @var{size}, where @code{eqv?} is\n"
397 "used as the equality predicate. The function returns an\n"
398 "integer in the range 0 to @var{size} - 1. Note that\n"
399 "@code{(hashv key)} may use internal addresses. Thus two calls\n"
400 "to hashv where the keys are @code{eqv?} are not guaranteed to\n"
401 "deliver the same value if the key object gets garbage collected\n"
402 "in between. This can happen, for example with symbols:\n"
403 "@code{(hashv 'foo n) (gc) (hashv 'foo n)} may produce two\n"
404 "different values, since @code{foo} will be garbage collected.")
405 #define FUNC_NAME s_scm_hashv
406 {
407 unsigned long sz = scm_to_unsigned_integer (size, 1, ULONG_MAX);
408 return scm_from_ulong (scm_ihashv (key, sz));
409 }
410 #undef FUNC_NAME
411
412
413 \f
414
415
416 unsigned long
417 scm_ihash (SCM obj, unsigned long n)
418 {
419 return (unsigned long) scm_raw_ihash (obj, 10) % n;
420 }
421
422 SCM_DEFINE (scm_hash, "hash", 2, 0, 0,
423 (SCM key, SCM size),
424 "Determine a hash value for @var{key} that is suitable for\n"
425 "lookups in a hashtable of size @var{size}, where @code{equal?}\n"
426 "is used as the equality predicate. The function returns an\n"
427 "integer in the range 0 to @var{size} - 1.")
428 #define FUNC_NAME s_scm_hash
429 {
430 unsigned long sz = scm_to_unsigned_integer (size, 1, ULONG_MAX);
431 return scm_from_ulong (scm_ihash (key, sz));
432 }
433 #undef FUNC_NAME
434
435
436 \f
437
438
439 void
440 scm_init_hash ()
441 {
442 #include "libguile/hash.x"
443 }
444
445
446 /*
447 Local Variables:
448 c-file-style: "gnu"
449 End:
450 */