# vectorizer.pl: Creates normalized binary document vectors and runs SVM-light
# Author: Thorsten Joachims, 13.10.2004
#
# Start with command: perl vectorizer.pl
#
# NOTE: assumes that svm_learn and svm_classify are in current directory
# NOTE: could be much faster, if the document vectors were not generated
#       from scratch for each label

# define dataset and labels
@labels=("astro-ph","cond-mat","cs","gr-qc","hep-ex","hep-lat","hep-ph",
         "hep-th","math-ph","math","nucl-ex","nucl-th","physics","quant-ph");
$traindocfile="arxiv_doc.train";
$testdocfile="arxiv_doc.test";
$trainlabelfile="arxiv_classes.train";
$testlabelfile="arxiv_classes.test";

$|=1;                                    # always flush stdout immediately after print

print "Gathering words for lexicon...";
%lexicon=&createLexicon($traindocfile);
print "done.\n";

for $label (@labels) {
    print "Creating SVM files for label '$label'...";
    &createSVMFile($traindocfile,$trainlabelfile,$label,"temp.train",\%lexicon);
    &createSVMFile($testdocfile,$testlabelfile,$label,"temp.test",\%lexicon);
    print "done.\n";
    print "Running SVM-light for label '$label':\n";
    system("./svm_learn -c 1 temp.train temp.model");
    system("./svm_classify temp.test temp.model temp.pred");
}
exit;

sub createLexicon {
    # create lexicon by collecting all words in file
    my($indocfile)=@_;
    my(%lexicon,$l,@words,$word);
    open(F,$indocfile) || die("Cannot open '$indocfile': $!");
    while($l=<F>) {                                # while not end of file
        if($l =~ /<DOC>/i) {                       # beginning of DOC
            $l=<F>;                                # skip next line (the DOCNO line)
            while(($l=<F>) && ($l!~/<\/DOC>/i)) {  # while not end of DOC
                $l =~ s/\r?\n$//;                  # remove CRLF
                @words=&tokenizeLine($l);          # split line into words
                for $word (@words) {
                    if(!$lexicon{$word}) {         # add new words to lexicon
                        $lexicon{$word}=scalar(keys(%lexicon))+1;
                    }
                }
            }
        }
    }
    close(F);
    return(%lexicon);
}

sub createSVMFile {
    # create SVM-light input file
    my($indocfile,$inlabelfile,$label,$outfile,$lexicon)=@_;
    my(%labels,@label,$l,%wordsInDoc,@words,$word,$docno,$weight,$line);

    %labels=();
    open(F,$inlabelfile) || die("Cannot open '$inlabelfile': $!");
    while($l=<F>) {                                # read labels from file
        $l =~ s/\r?\n$//;                          # remove CRLF
        ($docno,@label)=split(/\s+/,$l);
        $labels{$docno}=":".join(":",@label).":";
    }
    close(F);

    open(F,$indocfile) || die("Cannot open '$indocfile': $!");
    open(O,">$outfile") || die("Cannot open '$outfile' for writing: $!");
    while($l=<F>) {                                # while not end of file
        if($l =~ /<DOC>/i) {                       # beginning of DOC
            $l=<F>;                                # read next line
            $l=~/<DOCNO>\s*(\d+)\s*<\/DOCNO>/
                || die("Missing DOCNO");           # must contain DOCNO
            $docno=$1+0;
            %wordsInDoc=();
            while(($l=<F>) && ($l!~/<\/DOC>/i)) {  # while not end of DOC
                $l =~ s/\r?\n$//;                  # remove CRLF
                @words=&tokenizeLine($l);          # split line into words
                for $word (@words) {
                    if(${$lexicon}{$word}) {
                        $wordsInDoc{$word}=1;      # add word to local histogram
                    }
                }
            }
            # write document label and vector to next line in output file
            if($labels{$docno}=~/:$label:/i) {     # is doc pos or neg?
                $line="+1";
            }
            else {
                $line="-1";
            }
            # write features in order of increasing feature number
            for $word (sort {${$lexicon}{$a} <=> ${$lexicon}{$b}} keys(%wordsInDoc)) {
                # use binary word weighting and normalize to unit length
                $weight=$wordsInDoc{$word}/sqrt(scalar(keys(%wordsInDoc)));
                $line .= " ".${$lexicon}{$word}.":$weight";
            }
            print O "$line\n";
        }
    }
    close(F);
    close(O);
}

sub tokenizeLine {
    my($line)=@_;
    my(@words);
    $line=~tr/A-Z/a-z/;                            # make all lower case
    @words=split(/[ ,\.\;\:\-\!\?\n]+/,$line);     # split line into words
    return(@words);
}
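
# ----------------------------------------------------------------------
# Illustrative input/output sketch. This is inferred from the parsing
# code above, not copied from the actual arXiv files, so the field
# contents shown are hypothetical examples.
#
# arxiv_doc.train / arxiv_doc.test use a TREC-style markup: a <DOC>
# line, then a <DOCNO> line, then the document text, e.g.
#
#   <DOC>
#   <DOCNO> 1 </DOCNO>
#   we study the gravitational collapse of a scalar field
#   </DOC>
#
# arxiv_classes.train / arxiv_classes.test list one document per line,
# giving the document number followed by its label(s), e.g.
#
#   1 gr-qc astro-ph
#
# Each line of the generated temp.train / temp.test then has the
# SVM-light form
#
#   +1 3:0.25 17:0.25 42:0.25 ...
#
# where every word that occurs in the document (and in the lexicon)
# gets weight 1/sqrt(#distinct words in the document); here
# 0.25 = 1/sqrt(16), i.e. a binary vector scaled to unit Euclidean
# length.
# ----------------------------------------------------------------------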