# vectorizer.pl: Creates normalized binary document vectors and runs SVM-light
# Author: Thorsten Joachims, 13.10.2004
#
# Start with command: perl vectorizer.pl
#
# NOTE: assumes that svm_learn and svm_classify are in current directory
# NOTE: could be much faster, if the document vectors were not generated
#       from scratch for each label

# define dataset and labels
@labels=("astro-ph","cond-mat","cs","gr-qc","hep-ex","hep-lat","hep-ph",
         "hep-th","math-ph","math","nucl-ex","nucl-th","physics","quant-ph");
$traindocfile="arxiv_doc.train";
$testdocfile="arxiv_doc.test";
$trainlabelfile="arxiv_classes.train";
$testlabelfile="arxiv_classes.test";

$|=1;                                    # always flush stdout immediately after print

print "Gathering words for lexicon...";
%lexicon=&createLexicon($traindocfile);
print "done.\n";

for $label (@labels) {
    print "Creating SVM files for label '$label'...";
    &createSVMFile($traindocfile,$trainlabelfile,$label,"temp.train",\%lexicon);
    &createSVMFile($testdocfile,$testlabelfile,$label,"temp.test",\%lexicon);
    print "done.\n";
    print "Running SVM-light for label '$label':\n";
    system("./svm_learn -c 1 temp.train temp.model");
    system("./svm_classify temp.test temp.model temp.pred");
}
exit;

sub createLexicon {
    # create lexicon by collecting all words in file
    my($indocfile)=@_;
    my(%lexicon,$l,@words,$word);
    open(F,$indocfile) || die("Cannot open '$indocfile': $!");
    while($l=<F>) {                                # while not end of file
        if($l =~ /<DOC>/i) {                       # beginning of DOC
            $l=<F>;                                # skip next line (the DOCNO line)
            while(($l=<F>) && ($l!~/<\/DOC>/i)) {  # while not end of DOC
                $l =~ s/\r?\n$//;                  # remove CRLF
                @words=&tokenizeLine($l);          # split line into words
                for $word (@words) {
                    if(!$lexicon{$word}) {         # add new words to lexicon
                        $lexicon{$word}=scalar(keys(%lexicon))+1;
                    }
                }
            }
        }
    }
    close(F);
    return(%lexicon);
}

sub createSVMFile {
    # create SVM-light input file
    my($indocfile,$inlabelfile,$label,$outfile,$lexicon)=@_;
    my(%labels,@label,$l,%wordsInDoc,@words,$word,$docno,$weight,$line);

    %labels=();
    open(F,$inlabelfile) || die("Cannot open '$inlabelfile': $!");
    while($l=<F>) {                                # read labels from file
        $l =~ s/\r?\n$//;                          # remove CRLF
        ($docno,@label)=split(/\s+/,$l);
        $labels{$docno}=":".join(":",@label).":";
    }
    close(F);

    open(F,$indocfile) || die("Cannot open '$indocfile': $!");
    open(O,">$outfile") || die("Cannot open '$outfile' for writing: $!");
    while($l=<F>) {                                # while not end of file
        if($l =~ /<DOC>/i) {                       # beginning of DOC
            $l=<F>;                                # read next line
            $l=~/<DOCNO>\s*(\d+)\s*<\/DOCNO>/
                || die("Missing DOCNO");           # must contain DOCNO
            $docno=$1+0;
            %wordsInDoc=();
            while(($l=<F>) && ($l!~/<\/DOC>/i)) {  # while not end of DOC
                $l =~ s/\r?\n$//;                  # remove CRLF
                @words=&tokenizeLine($l);          # split line into words
                for $word (@words) {
                    if(${$lexicon}{$word}) {
                        $wordsInDoc{$word}=1;      # add word to local histogram
                    }
                }
            }
            # write document label and vector to next line in output file
            if($labels{$docno}=~/:$label:/i) {     # is doc pos or neg?
                $line="+1";
            }
            else {
                $line="-1";
            }
            # write features in order of increasing feature number
            for $word (sort {${$lexicon}{$a} <=> ${$lexicon}{$b}} keys(%wordsInDoc)) {
                # use binary word weighting and normalize to unit length
                $weight=$wordsInDoc{$word}/sqrt(scalar(keys(%wordsInDoc)));
                $line .= " ".${$lexicon}{$word}.":$weight";
            }
            print O "$line\n";
        }
    }
    close(F);
    close(O);
}

sub tokenizeLine {
    my($line)=@_;
    my(@words);
    $line=~tr/A-Z/a-z/;                            # make all lower case
    @words=split(/[ ,\.\;\:\-\!\?\n]+/,$line);     # split line into words
    return(@words);
}
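
# ----------------------------------------------------------------------
# Illustrative input/output sketch. This is inferred from the parsing
# code above, not copied from the actual arXiv files, so the field
# contents shown are hypothetical examples.
#
# arxiv_doc.train / arxiv_doc.test use a TREC-style markup: a <DOC>
# line, then a <DOCNO> line, then the document text, e.g.
#
#   <DOC>
#   <DOCNO> 1 </DOCNO>
#   we study the gravitational collapse of a scalar field
#   </DOC>
#
# arxiv_classes.train / arxiv_classes.test list one document per line,
# giving the document number followed by its label(s), e.g.
#
#   1 gr-qc astro-ph
#
# Each line of the generated temp.train / temp.test then has the
# SVM-light form
#
#   +1 3:0.25 17:0.25 42:0.25 ...
#
# where every word that occurs in the document (and in the lexicon)
# gets weight 1/sqrt(#distinct words in the document); here
# 0.25 = 1/sqrt(16), i.e. a binary vector scaled to unit Euclidean
# length.
# ----------------------------------------------------------------------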