# Builds an inverted index (word -> numerically sorted list of document
# numbers) from the collection file named by the first command-line argument,
# writes one "rank count word" line per word to zipf.dat (most widespread
# word first), and prints the scalar result of search() for a few sample
# queries.

{
    # strict and warnings are enabled per block rather than file-wide so that
    # they do not change how the search() code further down is compiled.
    use strict;
    use warnings;

    my $collection = $ARGV[0];
    defined $collection
        or die "usage: $0 <collection-file>\n";

    my $invIndex = buildInvIndex($collection);
    print "Found " . scalar(keys %{$invIndex}) . " unique words.\n";

    # Length of every inverted list, computed once instead of on every sort
    # comparison.
    my %docs_with = map { $_ => scalar @{ $invIndex->{$_} } } keys %{$invIndex};

    # Descending by number of documents; ties are broken alphabetically so
    # the output does not depend on hash ordering.
    my @ranked = sort { $docs_with{$b} <=> $docs_with{$a} or $a cmp $b }
                 keys %docs_with;

    open(my $zipf, '>', 'zipf.dat')
        or die "cannot write zipf.dat: $!\n";
    my $rank = 0;
    for my $word (@ranked) {
        $rank++;
        print {$zipf} "$rank $docs_with{$word} $word\n";
    }
    # Buffered write errors only surface at close time.
    close($zipf)
        or die "cannot close zipf.dat: $!\n";

    print scalar(search($invIndex, "galactic")) . "\n";
    print scalar(search($invIndex, "center")) . "\n";
    print scalar(search($invIndex, "galactic", "center")) . "\n";
    print scalar(search($invIndex, "center", "galactic")) . "\n";
    exit;
}

# buildInvIndex($filename)
#
# Reads $filename and returns a reference to a hash that maps each word to
# an array reference of document numbers, sorted numerically.
#
# Input layout, as far as this routine relies on it:
#   * a line matching <DOC> (case-insensitive) starts a document;
#   * the line right after it must contain <DOCNO> digits </DOCNO>;
#   * every following line up to one matching </DOC> (case-insensitive) is
#     document text.
#
# Text is lower-cased (A-Z only, for byte strings) and split on runs of
# space , . ; : - ! ? and newline.  Each document number is recorded at most
# once per word.
#
# Dies if the file cannot be opened or a document has no DOCNO line.
sub buildInvIndex {
    use strict;
    use warnings;

    my ($filename) = @_;
    defined $filename
        or die "buildInvIndex: no file name given\n";

    # Three-argument open: the name is always treated as a plain path.
    open(my $in, '<', $filename)
        or die "cannot open '$filename': $!\n";

    my %invIndex;
    while (defined(my $line = <$in>)) {
        next unless $line =~ /<DOC>/i;               # beginning of DOC

        my $docno_line = <$in>;                      # read next line
        (defined($docno_line)
            && $docno_line =~ /<DOCNO>\s*(\d+)\s*<\/DOCNO>/)
            or die "'$filename' line $.: expected <DOCNO> after <DOC>\n";
        my $docno = $1 + 0;

        # Set of distinct words seen in this document.
        my %wordsInDoc;
        while (defined($line = <$in>) && $line !~ /<\/DOC>/i) {
            # Strip the line terminator (LF or CRLF) only; chop would also
            # eat the last real character of an unterminated final line.
            $line =~ s/\r?\n\z//;

            for my $word (split /[ ,\.\;\:\-\!\?\n]+/, lc $line) {
                # A line that starts with a separator yields a leading
                # empty field; it is not a word.
                next if $word eq '';
                $wordsInDoc{$word} = 1;
            }
        }

        # add DOCNO to the inverted list of every word in the document
        push @{ $invIndex{$_} }, $docno for keys %wordsInDoc;
    }
    close($in);

    # sort the inverted lists
    @{$_} = sort { $a <=> $b } @{$_} for values %invIndex;

    return \%invIndex;
}

# NOTE(review): the text of search() appears to be cut off after "$i" in the
# reviewed copy; the fragment is reproduced unchanged below.
sub search {
  my($invIndex,@query)=@_;
  my(@accu,$i);
  @accu=@{${$invIndex}{$query[0]}}; # fill accumulator with first inv list
  for($i=1;$i