#!/usr/bin/perl -w

######################
# Documentation.
######################

=head1 NAME

agglib.pl

=head1 SYNOPSIS

agglib.pl [options] basedir algdir [algdir2 ...]

  Options:
    -exclude-series s1
                   exclude data series s1 from library
    -help|?        brief help message
    -library name  place the aggregated library in folder name instead of
                   in current directory
    -man           full documentation
    -quiet         do not print unnecessary messages
    -save-space    use soft links instead of copying to save space

=head1 DESCRIPTION

Aggregates models in separate folders into a single ensemble selection library structure.  Typically this structure looks like libname/testX/, where testX stands for one of a number of subfolders (for example test1).  Each subfolder holds the model predictions over the data subset (aka data series) corresponding to testX.  There will also often be libname/train, holding the predictions from the models over the training data.

The script copies model predictions from basedir/algdir/* to the aggregated library, for all the specified algorithm subfolders.  Assuming that the data series for each algorithm are 'train' and 'test1', the command

   agglib.pl myModels dt svm

makes folders 'train' and 'test1', and performs the following copying:

   myModels/dt/train/*  --------> train/
   myModels/svm/train/* --------> train/
   myModels/dt/test1/*  --------> test1/
   myModels/svm/test1/* --------> test1/

Unless the -exclude-series option is used, all sub-directories found in basedir/algdir/ are assumed to be data series.

=head1 OPTIONS

=over 3

=item B<-exclude-series> s1

Exclude the given data series from the aggregated library.  For example, '-exclude-series train' would exclude model predictions over the train data series from the library.  If this option is not specified agglib.pl uses all the data series found under basedir/<algdir>/, assuming each subdirectory is a separate data series.  Use the option multiple times to exclude multiple series.

=item B<-help>

Prints a brief help message and exits.

=item B<-library> name

Make a directory 'name' and place the aggregated library in that directory. If this option is not specified the library will be placed in the current directory.

=item B<-man>

Prints the manual page and exits.

=item B<-quiet>

Suppress unnecessary messages.  These include messages about files already existing in the newly aggregated library.

=item B<-save-space>

Instead of copying model files to the aggregated library, use soft links to link to the original files.  This will save considerable hard drive space.

=back

=cut

#######################################
# Implementation
#######################################

use strict;
use Getopt::Long;
use Pod::Usage;

# Option values, pre-set to their defaults.  $quiet and $saveSpace are
# file-level lexicals that addToLibrary() reads as well.
my $help       = 0;
my $man        = 0;
my @exclude;
my $agglibrary = ".";
my $saveSpace  = 0;
my $quiet      = 0;

# Parse the command line.  A syntax error prints the short usage text;
# -help and -man print progressively more of the POD and exit.
my $parsedOk = GetOptions(
    "help|?"           => \$help,
    "man"              => \$man,
    "exclude-series=s" => \@exclude,
    "library=s"        => \$agglibrary,
    "quiet"            => \$quiet,
    "save-space"       => \$saveSpace,
);
pod2usage(-verbose => 0) unless $parsedOk;

pod2usage(-verbose => 1) if $help;
pod2usage(-verbose => 2) if $man;

# Positional arguments: the base directory first, then one or more
# algorithm sub-directories.  Both are mandatory.

my $basedir = shift(@ARGV);
pod2usage("$0: No basedir argument given.") unless $basedir;

my @algdirs = @ARGV;
pod2usage("$0: No algdir argument(s) given.") unless @algdirs;

# Work out which data series are to be aggregated.
my @series = enumerateSeries($basedir, \@algdirs, \@exclude);


# Make the aggregate library folder, if it doesn't already exist.
# Directories are created and listed with Perl built-ins rather than by
# interpolating paths into shell commands, so names containing spaces or
# shell metacharacters are handled correctly and failures are reported.

if ($agglibrary ne "." && !-e $agglibrary) {
    require File::Path;
    File::Path::make_path($agglibrary);
    die("$0: failed to create library folder $agglibrary\n")
        if !-d $agglibrary;
}

# For every data series, gather the model files of each algorithm into
# the matching series folder of the aggregated library.
for my $s (@series) {
    my $destination = "$agglibrary/$s";
    if (!-e $destination) {
        mkdir($destination)
            or die("$0: failed to create $destination: $!\n");
    }
    for my $alg (@algdirs) {
        my $source = "$basedir/$alg/$s";
        opendir(my $dh, $source)
            or die("$0: cannot read $source: $!\n");
        # Like plain 'ls', ignore hidden entries (including . and ..).
        my @models = sort grep { !/^\./ } readdir($dh);
        closedir($dh);
        for my $m (@models) {
            addToLibrary("$source/$m", "$destination/$m");
        }
    }
}

##############################
# Helper functions.
##############################

# enumerateSeries($basedir, \@algdirs, \@exclude)
#
# Determines the data series shared by all algorithm folders.
#
#   $basedir  - directory holding the algorithm folders
#   $algdirs  - array ref of algorithm folder names under $basedir
#   $exclude  - array ref of series names to leave out
#
# Every non-hidden sub-directory of $basedir/<algdir> that is not
# excluded counts as a data series.  Returns the series names in sorted
# order (their count in scalar context).  Prints usage and exits if an
# algorithm folder is not a readable directory; exits with status 1 if
# some series is not present under every algorithm folder.
sub enumerateSeries {
    my ($basedir, $algdirs, $exclude) = @_;

    my %skip = map { $_ => 1 } @{$exclude};

    # Series name -> number of algorithm folders that contain it.
    my %series;

    for my $alg (@{$algdirs}) {
        my $folder = "$basedir/$alg";
        if (!(-d $folder && -r $folder)) {
            pod2usage("$0: $folder does not exist");
        }

        # Read the folder directly instead of parsing shell 'ls' output:
        # names with whitespace stay intact and nothing is passed to a
        # shell.  Hidden entries are ignored, as 'ls' would.
        opendir(my $dh, $folder)
            or die("$0: cannot read $folder: $!\n");
        my @entries = grep { !/^\./ } readdir($dh);
        closedir($dh);

        for my $f (@entries) {
            next if ($skip{$f});            # Skip if listed as exclusion
            next if (!-d "$folder/$f");     # Only sub-directories are series
            $series{$f} += 1;
        }
    }

    # Sorted so that callers see a stable, reproducible order.
    my @names = sort keys %series;

    # Check that each series is represented by all algorithms.

    for my $s (@names) {
        if ($series{$s} != scalar(@{$algdirs})) {
            print STDERR "Not all algorithms contain the same data series.\n";
            exit 1;
        }
    }

    return @names;
}

# addToLibrary($original, $copy)
#
# Places the model file $original into the aggregated library as $copy.
# Depending on the file-level $saveSpace flag the file is either copied
# (preserving mode and timestamps, as 'cp -p' does) or soft-linked.
# Existing destinations are never overwritten; a notice is printed to
# STDERR unless the file-level $quiet flag is set.  Dies if the copy or
# link cannot be made.
sub addToLibrary {
    my ($original, $copy) = @_;

    # Avoid overwriting existing files.
    # This can protect against errors and save doing needless work.
    # -l also catches a dangling soft link, which -e reports as missing.
    if (-e $copy || -l $copy) {
        print STDERR "*** $copy already exists\n" if (!$quiet);
        return;
    }

    if ($saveSpace) {
        # A relative link target is resolved relative to the directory
        # containing the link, not the current directory, so link to the
        # absolute path to keep the link valid.
        require File::Spec;
        my $target = File::Spec->rel2abs($original);
        symlink($target, $copy)
            or die("$0: failed to make soft link $copy to $original");
    }
    else {
        # List form bypasses the shell, so unusual characters in the
        # paths are passed through untouched.
        system('cp', '-p', '--', $original, $copy);
        die("$0: failed to copy $original to $copy") if $?;
    }
}
