/* Ideas: Prime numbers around 1000, 2000, 4000, ... 16000 are chosen for the table sizes, so that each hashtable can be extended to about twice its original size when it is 0.8 full. The hash function hashes a string according to the method described in lecture. In the hashtable structure, information such as a pointer to the table entries, the table sizes used, the number of hash table accesses, the no. of distinct words in the table, and the number of probes are stored. Each table entry is also a structure, containing the word and the frequency of the word. */ /* Program to compare the performance of linear probing and double hashing in counting the frequency of words in a text file. The program maintains two open-addressed hash tables, one of which uses linear probing and the other double hashing. It reads the words from a text file, and adds each word into both hash tables, and prints out various statistics at the end. */ #include #include #include #include #include "hashTable.h" /* Main program. For flexibility, the name of the text file to be processed is given as a command line argument. */ int main(int argc, char **argv) { char *usagestr = "Usage:\t%s filename\n"; FILE *fp; char word[MAXWORDLEN]; HashTbl lineartbl, /* hash table using linear probing */ doubletbl; /* hash table using double hashing */ int maxfreq, /* maximum frequency of words */ tablesize, /* current table size, same for both tables */ i; /* for looping */ char *maxword; /* most frequent word */ int totalnumwords = 0; /* number of words in text file */ float lineartime = 0, /* total time used for hashing keys using linear probing */ doubletime = 0, /* total time used for hashing keys using double hasing */ maxlineartime = 0,/* max time for hashing a key using linear probing */ maxdoubletime = 0,/* max time for hashing a key using double hashing */ maxlinrehash = 0, /* max time spent rehashing lineartbl */ maxdblrehash = 0, /* max time spent rehashing doubletbl */ t; /* time in seconds */ double start, end; /* used for timing purposes */ /* check that name of text file is given on the command line */ if (argc != 2) { fprintf(stderr, usagestr, argv[0]); exit(1); } /* Build hash tables */ /* initialize hash tables */ InitHashTbl(&lineartbl, LINEAR); InitHashTbl(&doubletbl, DOUBLE); /* open text file for reading */ printf("Processing %s\n\n", argv[1]); if ((fp = fopen(argv[1], "r")) == NULL) { perror("BuildHash"); exit(1); } /* read words from file and keep timing statistics until exhausted; here we assume that no reading errors will occur, so that fscanf either returns 1 if a word was read, or EOF when the end of the text file has been reached */ while (fscanf(fp, "%s", word) == 1) { start = clock(); t = AddWord(&lineartbl, word); end = clock(); if (t > maxlinrehash) { maxlinrehash = t; } t = (float)((end - start) / CLOCKS_PER_SEC); if (t > maxlineartime) { maxlineartime = t; } lineartime += t; start = clock(); t = AddWord(&doubletbl, word); end = clock(); if (t > maxdblrehash) { maxdblrehash = t; } t = (float)((end - start) / CLOCKS_PER_SEC); if (t > maxdoubletime) { maxdoubletime = t; } doubletime += t; totalnumwords++; } fclose(fp); /* print statistics */ maxfreq = FindMax(&maxword, &lineartbl); tablesize = lineartbl.tblsizes[lineartbl.numextended]; printf("Number of words = %d\n", totalnumwords); printf("Number of distinct words = %d\n", lineartbl.numwords); printf("Load factor = %.2f (table size = %d)\n", (double) lineartbl.numwords / tablesize, tablesize); printf("Maximum frequency of a word = %d\n", maxfreq); printf("Most frequent word is %s\n\n", maxword); printf("Linear probing statistics:\n"); printf("Table sizes used: "); for (i = 0; i <= lineartbl.numextended; i++) { printf("%d ",lineartbl.tblsizes[i]); } printf("\nMaximum probe length = %d\n", lineartbl.maxprobelen); printf("Average probe length = %.2f\n", (double) lineartbl.numprobes / lineartbl.numaccess); printf("Maximum time spent for hashing a key = %2.5f secs\n",maxlineartime); printf("Average time spent for hashing a key = %2.5f secs\n", lineartime / totalnumwords); printf("Maximum time spent rehashing when extending table = %2.2f secs\n\n", maxlinrehash); printf("Double hashing statistics:\n"); printf("Table sizes used: "); for (i = 0; i <= doubletbl.numextended; i++) { printf("%d ",doubletbl.tblsizes[i]); } printf("\nMaximum probe length = %d\n", doubletbl.maxprobelen); printf("Average probe length = %.2f\n", (double) doubletbl.numprobes / doubletbl.numaccess); printf("Maximum time spent for hashing a key = %2.5f secs\n",maxdoubletime); printf("Average time spent for hashing a key = %2.5f secs\n", doubletime / totalnumwords); printf("Maximum time spent rehashing when extending table = %2.2f secs\n\n", maxdblrehash); return 0; }