#include "hashTable.h" /************************* Hash Table Routines *************************/ /* Hash function based on division method. The given word is treated as a number in base 128, and its remainder when divided by the given modulus is computed. */ static int DivMethod(const char *word, int modulus) { int rem = 0; /* go through each character in the word and accumulate it into the running remainder; for a word of k characters, the value of rem at the end of the for loop would be: (word[0]*128^(k-1) + word[1]*128^(k-2) +...+ word[k-1]) % modulus */ for (; *word; word++) rem = ((rem << 7) + *word) % modulus; return rem; } /* Initializes a hash table */ void InitHashTbl(HashTbl *tbl, HashType type) { int tablesize; /* current table size */ tbl->type = type; tbl->numaccess = 0; tbl->maxprobelen = 0; tbl->numprobes = 0; tbl->numextended = 0; tablesize = tbl->tblsizes[0] = primenums[0]; /* sets to current table size */ /* set all entries to be 0 */ tbl->entry = (TblEntry*)calloc(tablesize,sizeof(TblEntry)); } /* Add a word into the hash table. If the word exists, increment its count. Note that this function is used for both hash tables. Note that this function is also used when extending the hashtable. function returns 0.0 if no rehashing is done, otherwise it returns the time taken to rehash the table. */ float AddWord(HashTbl *tbl, char *word) { int posn, /* current table location to examine */ incr, /* displacement to the next table location */ tries, /* number of table locations examined */ tablesize; /* current table size */ float t = 0.0; /* time taken for rehashing */ tablesize = tbl->tblsizes[tbl->numextended]; /* checks if number of words in hash table exceeds 0.8 of the table size. if so, extend the table and rehash all the words accordingly */ if (tbl->numwords >= 0.8 * tablesize) { double start, end; TblEntry *temp = tbl->entry; int i, j = ++(tbl->numextended); start = clock(); tbl->tblsizes[j] = primenums[j]; tbl->entry = (TblEntry*)calloc(primenums[j],sizeof(TblEntry)); tbl->numwords = 0; for (i = 0; i < tablesize; i++) { if (temp[i].word[0] != 0) { /* add each word to the newly extended table, and when AddWord() returns, we adjust freq to the correct value. if we do not adjust, then freq of each word would just be 1 */ AddWord(tbl, temp[i].word); tbl->entry[tbl->pos].freq = temp[i].freq; } } free(temp); tablesize = primenums[j]; end = clock(); t = (float)((end - start) / CLOCKS_PER_SEC); } /* compute starting position and displacement using division method; for linear probing, the displacement is set to 1, while for double hashing, it is set to a nonzero value less than tablesize */ posn = DivMethod(word, tablesize); incr = (tbl->type == LINEAR) ? 1 : 1 + DivMethod(word, tablesize - 2); /* search for word in hash table; we try up to tablesize different locations as given by the open address sequence, and if the word has not been found or inserted by then, the hash table must be full */ for (tries = 1; tries <= tablesize; tries++) { /* insert if blank table entry */ if (tbl->entry[posn].word[0] == 0) { strcpy(tbl->entry[posn].word, word); tbl->entry[posn].freq = 1; /* update statistics */ tbl->numwords++; tbl->numaccess++; tbl->numprobes += tries; if (tries > tbl->maxprobelen) tbl->maxprobelen = tries; tbl->pos = posn; return t; } /* update frequency if word is found */ if (strcmp(tbl->entry[posn].word, word) == 0) { tbl->entry[posn].freq++; /* update statistics */ tbl->numaccess++; tbl->numprobes += tries; tbl->pos = posn; return t; } /* otherwise go to next table entry in open-address sequence */ posn = (posn + incr) % tablesize; } /* no room for word */ fprintf(stderr, "AddWord: hash table exhausted\n"); exit(1); } /* Finds the most frequently occuring word in a hash table. Returns the frequency of that word, and stores that word in *maxword. */ int FindMax(char **maxword, HashTbl *tbl) { int i; int maxfreq, /* maximum frequency so far */ tablesize; /* current table size */ maxfreq = 0; *maxword = NULL; tablesize = tbl->tblsizes[tbl->numextended]; /* scan non-empty locations of hash table */ for (i = 0; i < tablesize; i++) { if (tbl->entry[i].word[0] != 0) { if (maxfreq < tbl->entry[i].freq) { maxfreq = tbl->entry[i].freq; *maxword = tbl->entry[i].word; } } } return maxfreq; }