###################################################
# ShotgunResults:
#
# Data structure to facilitate working with shotgun result
# files.  Create empty structures with ShotgunResults->new().
# Each object has following public interface:
#
#  load()
#  bestSingleModel()
#  bestEnsemble()
#  getEnsembleMembers()
#  getScore()
#  print()
#  smallerIsBetter()
#  maxEnsembleSize()
#
# See comments by each method for how to use them.
#
# Global variables you can alter:
#   @ShotgunResults::SMALLMETRICS
#
# History:
#  2005/11/03  created by Art Munson
###################################################

package ShotgunResults;
use strict;

# List of metrics for whom low numbers are better.
# Note that when results are reported as loss ALL
# metrics treat small numbers as better.
#
# Calling scripts can access this array if they need to
# add items to it, e.g.
#   push(@ShotgunResults::SMALLMETRICS, "FOOBAR_METRIC");
our @SMALLMETRICS = ("CA1",
		     "CA2",
		     "CA3",
		     "CA4",
		     "CA5",
		     "CA6",
		     "CST",
		     "BL",
		     "MXE",
		     "NRM",
		     "RMS");

# package's private constants
my $ITER = 0;
my $METRIC = 1;
my $SCORE = 2;
my $MODEL = 3;



# ShotgunResults->new()
# Constructs and returns empty result object.
sub new {
    my $type = shift @_;  # Store the package name.
    my ShotgunResults $self = {
	records => [],
	hcSet => "",
	testSet => "",
	loss => 0
	};
    bless($self, $type);
    return $self;
}

# load(hcFile, testFile, hcName, testName, loss)
#
# Parameters:
#   hcFile = file containing results for hillclimbing dataset
#   testFile = file containing results for test dataset
#   hcName = label to use for hillclimb results
#   testName = label to use for test results
#   loss = whether results are in loss format or not (1 or 0)
#
# Pre:
#   The two result files are already aligned so that each line
#   matches by model-metric.
# Post:
#   Reads in the result files.
sub load {
    my ShotgunResults $self = shift @_;
    my ($hcFile, $testFile, $hcSet, $testSet, $loss) = @_;

    die("Missing required parms.")
	if (!defined($hcFile)
	    || !defined($testFile)
	    || !defined($hcSet)
	    || !defined($testSet)
	    || !defined($loss));

    $self->{hcSet} = $hcSet;
    $self->{testSet} = $testSet;
    $self->{loss} = $loss;

    my $recs = $self->{records};

    # Open hillclimb and test result files.
    open(HCFILE, "<$hcFile")
	or die("$hcFile: could not open file");
    open(TESTFILE, "<$testFile")
	or die("$testFile: could not open file");

    # Assume files already aligned by model and iteration (they should be).
    my $hcLine;
    my $testLine;

    # Read in the data, matching up the hc and test results by line.

    while (defined($hcLine = <HCFILE>) && defined($testLine = <TESTFILE>)) {
	my @hcFields = split(' ', $hcLine);
	my @testFields = split(' ', $testLine);

	# Sanity check that they are really aligned.
	my $aligned =
	       $hcFields[$METRIC] eq $testFields[$METRIC]
	    && $hcFields[$ITER] == $testFields[$ITER]
	    && $hcFields[$MODEL] eq $testFields[$MODEL];
	die "Mismatched lines:\n\t$hcLine\n\t$testLine\n" if (!$aligned);

	push(@{$recs}, {$hcSet => \@hcFields, $testSet => \@testFields});
    }

    close(TESTFILE);
    close(HCFILE);
}

# bestSingleModel(metric)
# Finds the best single model (not ensemble) with respect to
# the target metric on the hillclimb set.
# pre: true
# post: returns [1, model] pair that uniquely identifies model;
#       this reference to the pair can be used to lookup performance
#       of different metrics via getScore()
sub bestSingleModel {
    my ShotgunResults $self = shift @_;
    my $metric = shift @_;
    $metric = "\U$metric";

    # Filter records to
    #  a) only contain those pertaining to metric, and
    #  b) only single models.
    my $filtered = $self->filter($metric)->filterIteration(1);

    # Sort records by performance on metric.
    $filtered->sortByScore($metric);

    # Best single model is now first model in records.
    my ($iter, $m, $score, $model) = $filtered->getFields($self->{hcSet}, 0);
    die("bestSingleModel(): could not find a single model") if ($iter != 1);

    return [1, $model];
}

# bestEnsemble(metric)
# Finds the best ensemble (possibly single model) with respect to
# the target metric on the hillclimb set.
# pre: true
# post: returns [iter, model] pair that uniquely identifies ensemble;
#       this reference can be used to lookup performances via getScore()
sub bestEnsemble {
    my ShotgunResults $self = shift @_;
    my $metric = shift @_;
    $metric = "\U$metric";

    # Filter records to only contain those pertaining to metric.
    my $filtered = $self->filter($metric);

    # Sort records by performance on metric.
    $filtered->sortByScore($metric);

    # Best ensemble is now first model in records.
    my ($iter, $m, $score, $model) = $filtered->getFields($self->{hcSet}, 0);
    return [$iter, $model];
}

# getEnsembleMembers([iter,model]) Gets the models contained in the
# specified ensemble. The ensemble is identified with a reference to
# the pair [iter,model].
# example: getEnsembleMembers($results->bestEnsemble("ACC"))
# post: returns the list of modelsin the ensemble; note there are
# usually duplicates in the list
sub getEnsembleMembers {
    my ShotgunResults $self = shift @_;
    my $id = shift @_;
    my $bestIter = $id->[0];

    # Prune record set down to only contain one metric.
    # (We can use any metric here.)
    my $filtered = $self->filter("ACC");

    my @members;
    my $hcSet = $self->{hcSet};

    for (my $i = 0; $i < $bestIter; ++$i) {
	my ($iter, $m, $score, $model) = $filtered->getFields($hcSet, $i);
	die "bad iteration found\n" if ($iter > $bestIter);
	push(@members, $model);
    }

    return @members;
}

# getScore([iter,model], metric, dataset)
# Gets the performance of the specified ensemble (or single model)
# for the given metric on the dataset.  The ensemble is identified
# with a reference to the pair [iter,model].
# example: getScore($results->bestEnsemble("ACC"), "ACC", "hc")
# post: returns the score found, or -1 if the result cannot be found
sub getScore {
    my ShotgunResults $self = shift @_;
    my ($id, $metric, $set) = @_;
    my $iter = $id->[0];
    my $model = $id->[1];
    $metric = "\U$metric";

    my $perf = -1;
    my $numRecs = scalar(@{$self->{records}});

    for (my $row = 0; $row < $numRecs; ++$row) {
	my ($rIter, $rMetric, $rScore, $rModel) = $self->getFields($set, $row);
	my $found = $iter == $rIter
	    && $metric eq $rMetric
	    && $model eq $rModel;
	if ($found) {
	    $perf = $rScore;
	    last;
	}
    }

    return $perf;
}

# print(filehandle)
# post: print this object's results with format
#   <hillclimb fields> <test fields>
# example:
#   $sresults->print(*STDERR);
# note: If file handle is omitted, prints to STDOUT.
sub print {
    my ShotgunResults $self = shift @_;
    my ($fh) = @_;
    $fh = *STDOUT if (!defined($fh));

    for my $row (@{$self->{records}}) {
	for my $set ($self->getSets()) {
	    my @fields = @{$row->{$set}};
	    print $fh "@fields\t";
	}
	print $fh "\n";
    }
}

# smallerIsBetter(metric)
# pre: load() previously called
# post: returns 1 if results loaded in loss mode or metric treats smaller
#       values as better; 0 otherwise
sub smallerIsBetter {
    my ShotgunResults $self = shift @_;
    my ($metric) = @_;
    $metric = "\U$metric";

    # When results reported as loss, smaller is always better.
    return 1 if ($self->{loss});

    # Non-loss case: iterate through list of small metrics and
    # check for match.
    for my $small (@SMALLMETRICS) {
	return 1 if ($small eq $metric);
    }
    return 0;
}

# maxEnsembleSize([metric])
#
# Returns the size of the largest ensemble in the results for the
# given metric.  If the metric is not specified, RMS is used.  The
# only reason to specify a metric is if the results to contain scores
# for the RMS metric; all metrics present in results should have equally
# large maximum ensembles.
sub maxEnsembleSize {
    my ShotgunResults $self = shift @_;
    my ($metric) = @_;
    $metric = "RMS" if (!defined($metric));
    $metric = "\U$metric";

    my $filtered = $self->filter($metric);
    my $count = scalar(@{$filtered->{records}});
    return $count;
}

################################################
# Private methods below here.
# It's too hard to really hide them, so you are on the honor system.
################################################

sub filter {
    my ShotgunResults $self = shift @_;
    my ($metric) = @_;

    # Make copy to fill with filtered records.
    my $filtered = ShotgunResults->new();
    my $hcSet = $self->{hcSet};
    $filtered->{hcSet} = $hcSet;
    $filtered->{testSet} = $self->{testSet};
    $filtered->{loss} = $self->{loss};

    my @filteredRecs;

    for my $r (@{$self->{records}}) {
	if ($r->{$hcSet}->[$METRIC] eq $metric) {
	    push(@filteredRecs, $r);
	}
    }

    $filtered->{records} = \@filteredRecs;

    return $filtered;
}

sub filterIteration {
    my ShotgunResults $self = shift @_;
    my ($iter) = @_;

    # Make copy to fill with filtered records.
    my $filtered = ShotgunResults->new();
    my $hcSet = $self->{hcSet};
    $filtered->{hcSet} = $hcSet;
    $filtered->{testSet} = $self->{testSet};
    $filtered->{loss} = $self->{loss};

    my @filteredRecs;

    for my $r (@{$self->{records}}) {
	if ($r->{$hcSet}->[$ITER] == $iter) {
	    push(@filteredRecs, $r);
	}
    }

    $filtered->{records} = \@filteredRecs;

    return $filtered;
}

sub getSets {
    my ShotgunResults $self = shift @_;
    return ($self->{hcSet}, $self->{testSet});
}



# pre: call filter(metric) first
sub sortByScore {
    my ShotgunResults $self = shift @_;
    my ($metric) = @_;
    my $hcSet = $self->{hcSet};

    # If bigger is better, need to flip sort order.  We can do
    # this by multiplying the comparison number by -1.
    my $inverter = ($self->smallerIsBetter($metric)) ? 1 : -1;

    my @sorted = sort {
	my $scoreA = $a->{$hcSet}->[$SCORE];
	my $scoreB = $b->{$hcSet}->[$SCORE];
	if ($scoreA == $scoreB) {
	    # break ties by choosing lower iteration.
	    # Note that there is never need to invert order here.
	    my $iterA = $a->{$hcSet}->[$ITER];
	    my $iterB = $b->{$hcSet}->[$ITER];
	    return ($iterA <=> $iterB);
	}
	return ($scoreA <=> $scoreB) * $inverter;
    } @{$self->{records}};

    $self->{records} = \@sorted;
}



# getFields(dataset, row)
#  returns (iteration, metric, score, model)
sub getFields {
    my ShotgunResults $self = shift @_;
    my ($dataset, $row) = @_;
    my $data = $self->{records}->[$row]->{$dataset};
    return ($data->[$ITER],
	    $data->[$METRIC],
	    $data->[$SCORE],
	    $data->[$MODEL]);
}





1;
