Guile Mailing List Archive
[Date Prev][Date Next][Thread Prev][Thread Next][Date Index][Thread Index]
Re: Scheme style auto-resizing hashtable (fwd)
/*****************************************************************************
* Copyright (C) 1998, Jay Glascoe, NASA Goddard Institute for Space Studies
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*****************************************************************************/
/*****************************************************************************
* File "hashtab-type.c", by Jay Glascoe. 10/27/98
* Inspired by Jim Blandy's image-type example,
* the Python dictionary object, and a whole host
* of messages on the Guile mailing list.
* Special thanks to Maciej Stachowiak of MIT
* and Harvey J. Stein of BFM Financial Research.
*
* This is a Guile extension providing procedures to
* create and operate on auto-resizing hash tables.
*
* A hash table is a pair, it looks like this
*
* (header . vector)
*
* The car of the pair, the header, is a vector. It's used to
* keep track of various book keeping details. It looks
* like this
*
* #(number-entries number-nonempty-buckets auto-shrink-flag)
*
* The cdr of the hash table pair is also a vector. It's
* elements are referred to as "buckets" or "entry lists".
* It looks like this
*
* #(bucket1 bucket2 ... bucketN)
*
* Each bucket is a list of entries
*
* (entry1 entry2 ... entryN)
*
* Each entry is a pair, or "association". The car of the
* association is the "hash value" of the entry's key. The
* cdr of the association is a key-value pair. Entries look
* like this
*
* (hash (key . value))
*
* The procedures defined in this extension are listed below.
*
* primitive: make-hashtab
* Return a new auto-resizing hash table (auto-shrink is
* disabled by default).
*
* primitive: hashtab-enable hashtab symbol
* primitive: hashtab-disable hashtab symbol
* "symbol" should be one of 'auto-shrink, ... [no others yet]
* Turn the specified option on/off.
*
* primitive: hashtab-set! hashtab key value
* Insert "value" in hash table "hashtab" under key "key".
* If "key" is already present in "hashtab", overwrite the
* previous associated value with "value".
*
* primitive: hashtab-ref hashtab key [default]
* Look up "key" in the hash table "hashtab", and return the value
* (if any) associated with it. If key is not found, return
* default (or #f if no default argument is supplied).
*
* primitive: hashtab-del! hashtab key [default]
* Delete the entry in hash table "hashtab" having key "key",
* and return the value associated with "key" (if any).
* If key is not found, return default (or #f if no default
* argument is supplied).
*
*****************************************************************************/
#include <math.h>
#include <libguile.h>
#include <guile/gh.h>
/* #include "my-hasher.h" */
/* this is okay for hashes with up to 1,000,000 entries */
#define my_hasher(hash_ptr, obj, message, proc_name) \
*(hash_ptr) = scm_hasher((obj), 2097157L, 10)
#define DEFAULT_NUMBER_BUCKETS 4
#define MAX_MEAN_NONEMPTY_BUCKETS_SIZE 3 /* 3.14159265 ;) */
static SCM
my_scm_make_vector(register long i, SCM fill)
{
/* modified to allow for a long i (rather than a SCM i) */
SCM v;
register long j;
SCM *velts;
SCM_NEWCELL(v);
SCM_DEFER_INTS;
SCM_SETCHARS(v, scm_must_malloc(i?(long)(i*sizeof(SCM)):1L,
"my_scm_make_vector"));
SCM_SETLENGTH(v, i, scm_tc7_vector);
velts = SCM_VELTS(v);
j = 0;
while (--i >= j)
(velts)[i] = fill;
SCM_ALLOW_INTS;
return v;
}
#define make_dflt_bucket() SCM_EOL
#define make_entry(hash, key, value) \
scm_cons(scm_long2num(hash), scm_cons(key, value))
static SCM auto_shrink_symbol;
static SCM default_bucket;
static void
init_mysymbols(void) /* called by init_hashtab_type */
{
default_bucket =
SCM_CDR(scm_sysintern("*default-hashtab-bucket*", make_dflt_bucket()));
auto_shrink_symbol = SCM_CAR(scm_sysintern0("auto-shrink"));
return;
}
#define NUMBER_ENTRIES_INDEX 0
#define NUMBER_NONEMPTY_BUCKETS_INDEX 1
#define AUTO_SHRINK_FLAG_INDEX 2
#define NUMBER_HEADER_ELEMENTS 3
static SCM
make_header(SCM number_entries, SCM number_nonempty_buckets,
SCM auto_shrink_flag)
{
/* #(number-entries number-nonempty-buckets auto-shrink-flag)
*/
SCM vector = my_scm_make_vector(NUMBER_HEADER_ELEMENTS, SCM_UNDEFINED);
SCM *velts = SCM_VELTS(vector);
velts[NUMBER_ENTRIES_INDEX] = number_entries;
velts[NUMBER_NONEMPTY_BUCKETS_INDEX] = number_nonempty_buckets;
velts[AUTO_SHRINK_FLAG_INDEX] = auto_shrink_flag;
return vector;
}
static SCM
my_make_hashtab(SCM, SCM);
SCM_PROC(s_make_hashtab, "make-hashtab", 0, 0, 0, make_hashtab);
static SCM
make_hashtab(void)
{
/* an argument-less constructor. fool proof! ;)
*/
long inumber_buckets = DEFAULT_NUMBER_BUCKETS;
SCM auto_shrink_flag = SCM_BOOL_F;
return my_make_hashtab(inumber_buckets, auto_shrink_flag);
}
static SCM
my_make_hashtab(long inumber_buckets, SCM auto_shrink_flag)
{
SCM header = make_header(SCM_INUM0, SCM_INUM0, auto_shrink_flag);
SCM vector = my_scm_make_vector(inumber_buckets, SCM_EOL);
SCM hashtab = scm_cons(header, vector);
return hashtab;
}
SCM_PROC(s_hashtab_enable, "hashtab-enable", 2, 0, 0, hashtab_enable);
SCM_PROC(s_hashtab_disable, "hashtab-disable", 2, 0, 0, hashtab_disable);
static SCM
hashtab_enable(SCM hashtab, SCM symbol)
{
/* turn auto-shrink on */
SCM header = SCM_CAR(hashtab);
SCM *header_elts = SCM_VELTS(header);
SCM_ASSERT(symbol == auto_shrink_symbol,
symbol, SCM_ARG2, "hashtab-enable");
header_elts[AUTO_SHRINK_FLAG_INDEX] = SCM_BOOL_T;
return SCM_BOOL_T;
}
static SCM
hashtab_disable(SCM hashtab, SCM symbol)
{
/* turn auto-shrink off */
SCM header = SCM_CAR(hashtab);
SCM *header_elts = SCM_VELTS(header);
SCM_ASSERT(symbol == auto_shrink_symbol,
symbol, SCM_ARG2, "hashtab-disable");
header_elts[AUTO_SHRINK_FLAG_INDEX] = SCM_BOOL_T;
return SCM_BOOL_F;
}
SCM_PROC(s_hashtab_ref, "hashtab-ref", 2, 1, 0, hashtab_ref);
static SCM
hashtab_ref(SCM hashtab, SCM key, SCM not_here)
{
/* Given a hash table, a key, and (optionally) a "not-here"
* object, return the value associated with the key.
* Return "not-here" (#f by default) if there is no such
* key in the hashtable.
*/
SCM vector = SCM_CDR(hashtab);
SCM *velts = SCM_VELTS(vector);
long vec_len = SCM_LENGTH(vector);
long hash = 0;
long i;
SCM bucket, tail;
my_hasher(&hash, key, SCM_ARG2, "hashtab-ref");
i = hash & (vec_len - 1);
bucket = velts[i];
if (SCM_UNBNDP(not_here))
not_here = SCM_BOOL_F;
for (tail = bucket; tail != SCM_EOL; tail = SCM_CDR(tail))
{
SCM entry = SCM_CAR(tail);
SCM my_scm_hash = SCM_CAR(entry);
long myhash =
scm_num2long(my_scm_hash, "woops!", "hashtab-ref");
if (myhash == hash)
{
SCM pair = SCM_CDR(entry);
SCM mykey = SCM_CAR(pair);
if (scm_equal_p(mykey, key) == SCM_BOOL_T)
return SCM_CDR(pair);
}
}
return not_here;
}
static SCM
my_hashtab_set(SCM hashtab,long hash, SCM key, SCM value,int *resize_flag_ptr)
{
/* called by hashtab_setx
* Insert the key-value pair into the hashtable.
* Signal the need for resize if the table is too small.
*/
SCM myvec = SCM_CDR(hashtab);
SCM header = SCM_CAR(hashtab);
SCM *header_elts = SCM_VELTS(header);
long vec_len = SCM_LENGTH(myvec);
long i = hash & (vec_len - 1);
SCM number_entries = header_elts[NUMBER_ENTRIES_INDEX];
long inumber_entries = SCM_INUM(number_entries);
SCM *velts = SCM_VELTS(myvec);
SCM bucket = velts[i];
int bucket_size = 1;
SCM old_entry_list = bucket;
SCM entry_list;
if (bucket == SCM_EOL)
bucket_size = 0;
for (entry_list = old_entry_list;
entry_list != SCM_EOL;
entry_list = SCM_CDR(entry_list))
{
SCM entry = SCM_CAR(entry_list);
SCM my_scm_hash = SCM_CAR(entry);
SCM pair;
SCM mykey;
long myhash;
myhash = scm_num2long(my_scm_hash, "bif!", "hashtab-set!");
if (myhash != hash)
continue;
pair = SCM_CDR(entry);
mykey = SCM_CAR(pair);
if (scm_equal_p(mykey, key) == SCM_BOOL_T)
{
SCM_SETCDR(pair, value);
return value;
}
}
/* cons new entry onto bucket, and do some book-keeping
*/
{
SCM entry = make_entry(hash, key, value);
SCM number_entries = header_elts[NUMBER_ENTRIES_INDEX];
long inumber_entries = SCM_INUM(number_entries);
SCM number_nonempty_buckets =
header_elts[NUMBER_NONEMPTY_BUCKETS_INDEX];
long inumber_nonempty_buckets = SCM_INUM(number_nonempty_buckets);
if (bucket_size == 0)
{
++inumber_nonempty_buckets;
header_elts[NUMBER_NONEMPTY_BUCKETS_INDEX] =
SCM_MAKINUM(inumber_nonempty_buckets);
velts[i] = scm_cons(entry, SCM_EOL);
}
else
SCM_SETCDR(bucket, scm_cons(entry, SCM_CDR(bucket)));
++inumber_entries;
header_elts[NUMBER_ENTRIES_INDEX] = SCM_MAKINUM(inumber_entries);
if (inumber_entries >
MAX_MEAN_NONEMPTY_BUCKETS_SIZE * inumber_nonempty_buckets)
*resize_flag_ptr = 1;
return value;
}
}
SCM_PROC(s_hashtab_setx, "hashtab-set!", 3, 0, 0, hashtab_setx);
static SCM
hashtab_setx(SCM hashtab, SCM key, SCM value)
{
/* Calls my_hashtab_set after hashing the key argument.
* If my_hashtab_set signals that resize is necessary,
* then call resize_hashtab.
*/
long hash = 0;
int resize_flag = 0;
SCM retval;
my_hasher(&hash, key, SCM_ARG2, "hashtab-set!");
retval = my_hashtab_set(hashtab, hash, key, value, &resize_flag);
if (resize_flag == 1)
{
SCM new_hashtab = resize_hashtab(hashtab, 1);
SCM_SETCAR(hashtab, SCM_CAR(new_hashtab));
SCM_SETCDR(hashtab, SCM_CDR(new_hashtab));
}
return retval;
}
static SCM
resize_hashtab(SCM hashtab, int flag)
{
/* assuming (correctly) that each key from the old
* hash table is unique is a big win here, so we don't
* call hashtab_setx. instead, we duplicate its code
* and make the optimizing modifications
*/
SCM header = SCM_CAR(hashtab);
SCM *header_elts = SCM_VELTS(header);
SCM old_vec = SCM_CDR(hashtab);
SCM *old_velts = SCM_VELTS(old_vec);
register long old_len = SCM_LENGTH(old_vec);
SCM auto_shrink_flag = header_elts[AUTO_SHRINK_FLAG_INDEX];
register long inumber_nonempty_buckets = 0;
register long inumber_entries = 0;
long new_len;
SCM new_hashtab;
SCM *new_velts;
SCM *new_header_elts;
SCM my_inum1 = SCM_MAKINUM(1);
long new_len_minus_one;
register long i;
if (flag == -1) /* shrink the hash table */
new_len = old_len / 2;
else /* grow the hash table */
new_len = old_len * 2;
new_len_minus_one = new_len - 1;
new_hashtab = my_make_hashtab(new_len, auto_shrink_flag);
new_velts = SCM_VELTS(SCM_CDR(new_hashtab));
new_header_elts = SCM_VELTS(SCM_CAR(new_hashtab));
for (i = 0; i < old_len; ++i)
{
SCM entry_list;
SCM bucket = old_velts[i];
for (entry_list = bucket;
entry_list != SCM_EOL;
entry_list = SCM_CDR(entry_list),
SCM_SETCDR(bucket, entry_list)) /* delete the old bucket
* en passant.
*/
{
SCM entry = SCM_CAR(entry_list);
long myhash;
SCM my_scm_hash = SCM_CAR(entry);
long index = 0;
SCM new_bucket;
myhash = scm_num2long(my_scm_hash, "jeez!", "resize_hashtab");
index = myhash & new_len_minus_one;
new_bucket = new_velts[index];
if (new_bucket == SCM_EOL)
{
new_velts[index] = scm_cons(entry, SCM_EOL);
++inumber_nonempty_buckets;
}
else
SCM_SETCDR(new_bucket, scm_cons(entry, SCM_CDR(new_bucket)));
++inumber_entries;
}
/* let go of the old bucket (what's left of it anyway)
* so Guile may gc it. Apologies to anyone holding on
* to the old bucket, but any code depending on the previous
* state of the hash should fail gracefully (any such code
* shouldn't have been written in the first place).
*/
old_velts[i] = SCM_EOL;
}
/* book keeping
*/
{
SCM new_number_nonempty_buckets =SCM_MAKINUM(inumber_nonempty_buckets);
SCM new_number_entries = SCM_MAKINUM(inumber_entries);
SCM new_header = SCM_CAR(new_hashtab);
SCM *new_header_elts = SCM_VELTS(new_header);
new_header_elts[NUMBER_NONEMPTY_BUCKETS_INDEX] =
new_number_nonempty_buckets;
new_header_elts[NUMBER_ENTRIES_INDEX] =
new_number_entries;
if (flag == -1 && inumber_entries
> MAX_MEAN_NONEMPTY_BUCKETS_SIZE * inumber_nonempty_buckets)
{
fprintf(stderr, "PANIC: downsized hash table prematurely. "
"disabling auto-shrink.\n");
fflush(stderr);
new_header_elts[AUTO_SHRINK_FLAG_INDEX] = SCM_BOOL_F;
}
return new_hashtab;
}
}
static void
my_scm_delete_x(register long hash, SCM *bucket_ptr, SCM key,
SCM *value_ptr, register short *found_flag_ptr)
{
/* heavily modified version of scm_delete_x.
* now PH balanced for women.
*/
SCM *prev;
SCM walk;
for (prev = bucket_ptr, walk = *bucket_ptr;
walk != SCM_EOL;
walk = SCM_CDR (walk))
{
SCM entry = SCM_CAR(walk);
if (hash == scm_num2long(SCM_CAR(entry), "tweet!", "my_scm_delete_x"))
{
SCM pair = SCM_CDR(entry);
if (SCM_BOOL_T == scm_equal_p(SCM_CAR(pair), key))
{
*value_ptr = SCM_CDR(pair);
*prev = SCM_CDR(walk);
*found_flag_ptr = 1;
break;
}
else
prev = SCM_CDRLOC (walk);
}
else
prev = SCM_CDRLOC (walk);
}
return;
}
SCM_PROC(s_hashtab_delx, "hashtab-del!", 2, 1, 0, hashtab_delx);
static SCM
hashtab_delx(SCM hashtab, SCM key, SCM not_here)
{
/* calls my_scm_delete_x for the actual deletion
* then performs book keeping and auto-shrinks if necessary
*
* NOTE: I've put as many optimizations as I can think of in
* here, but it's still slow (relative to Guile's fixed size
* hash tables). Why? Because (hash (key . value)) is
* significantly larger than just (key . value) and guile
* has an easier time gc'ing the smaller of the two.
*/
SCM vector = SCM_CDR(hashtab);
SCM *velts = SCM_VELTS(vector);
long vec_len = SCM_LENGTH(vector);
long hash = 0;
long i;
SCM value;
short found_flag = 0;
SCM *bucket_ptr;
if (SCM_UNBNDP(not_here))
not_here = SCM_BOOL_F;
my_hasher(&hash, key, SCM_ARG2, "hashtab-del!");
i = hash & (vec_len - 1);
bucket_ptr = velts + i;
my_scm_delete_x(hash, bucket_ptr, key, &value, &found_flag);
if (! found_flag)
return not_here;
{
SCM header = SCM_CAR(hashtab);
SCM *header_elts = SCM_VELTS(header);
SCM number_nonempty_buckets =
header_elts[NUMBER_NONEMPTY_BUCKETS_INDEX];
long inumber_nonempty_buckets = SCM_INUM(number_nonempty_buckets);
SCM number_entries = header_elts[NUMBER_ENTRIES_INDEX];
long inumber_entries = SCM_INUM(number_entries);
SCM auto_shrink_flag = header_elts[AUTO_SHRINK_FLAG_INDEX];
if (*bucket_ptr == SCM_EOL)
{
--inumber_nonempty_buckets;
number_nonempty_buckets = SCM_MAKINUM(inumber_nonempty_buckets);
header_elts[NUMBER_NONEMPTY_BUCKETS_INDEX] =
number_nonempty_buckets;
}
--inumber_entries;
number_entries = SCM_MAKINUM(inumber_entries);
header_elts[NUMBER_ENTRIES_INDEX] = number_entries;
if (auto_shrink_flag == SCM_BOOL_F)
return value;
if (vec_len > DEFAULT_NUMBER_BUCKETS &&
2 * inumber_entries < inumber_nonempty_buckets
* MAX_MEAN_NONEMPTY_BUCKETS_SIZE)
{
SCM new_hashtab = resize_hashtab(hashtab, -1);
SCM_SETCAR(hashtab, SCM_CAR(new_hashtab));
SCM_SETCDR(hashtab, SCM_CDR(new_hashtab));
}
return value;
}
}
void
init_hashtab_type(void)
{
init_mysymbols();
scm_make_gsubr (s_make_hashtab, 0, 0, 0, make_hashtab);
scm_make_gsubr (s_hashtab_ref, 2, 1, 0, hashtab_ref);
scm_make_gsubr (s_hashtab_setx, 3, 0, 0, hashtab_setx);
scm_make_gsubr (s_hashtab_delx, 2, 1, 0, hashtab_delx);
scm_make_gsubr (s_hashtab_enable, 2, 0, 0, hashtab_enable);
scm_make_gsubr (s_hashtab_disable, 2, 0, 0, hashtab_disable);
}
Guile Home |
Main Index |
Thread Index