#!/usr/bin/perl
use strict 'refs';
use warnings;
use Text::CSV;
use Text::CSV::Encoded;
use File::Basename;

# Print the command-line synopsis on STDOUT and terminate the program
# with exit status 1.  Never returns.
sub usage {
    my $prog = basename($0);
    my @help = (
        "$prog [-s source] -k key1 [ ... -k keyn] <CSV file> ...\n",
        "  Joins multiple CSV files into one CSV file using the\n",
        "  column(s) named using -k to define row identity\n",
        "\n",
        "  -s source : output only rows derived partly from that source\n",
    );
    print STDOUT @help;
    exit 1;
}

# Every output row is terminated with CRLF.
my $nl = "\r\n";

# One parser/formatter object is shared by all input files and the output.
my $csv = Text::CSV::Encoded->new({ encoding_in => "UTF-8",
                                    encoding_out => "UTF-8" });

# Key column names in the order given by -k, and for each name its
# position in @key_fields.
my (@key_fields, %key_index);

# A source that must be present in each output row.  It is compared
# against input file names with a trailing ".csv" removed.
my $req_source;

# Consume leading options; everything after them is an input file name.
while (@ARGV && $ARGV[0] =~ /^-/) {
    my $opt = shift @ARGV;
    if ($opt eq '-k') {
        my $k = shift @ARGV;
        if (!defined($k)) {
            print STDERR "Option -k requires a column name.\n";
            usage();
        }
        # A repeated -k for the same column must not open a second key
        # slot: a header can only fill one slot per name, so the other
        # would look "missing" in every file.
        if (!exists $key_index{$k}) {
            push @key_fields, $k;
            $key_index{$k} = $#key_fields;
        }
    } elsif ($opt eq '-s') {
        $req_source = shift @ARGV;
        if (!defined($req_source)) {
            print STDERR "Option -s requires a source name.\n";
            usage();
        }
    } else {
        print STDERR "Unknown option $opt.\n";
        usage();
    }
}

# At least one input file and at least one key column are required.
if (!@ARGV || !@key_fields) { usage() }

# Shared state filled while reading the inputs:
#   %db            source name => { row key => [ cleaned fields ] }
#   %files_for_key row key     => [ source names containing that key ]
#   %file_cols     source name => { column name => column index }
#   %file_key_locs source name => [ index of each key column ]
# %colindex is not used by the code visible in this file.
my (%colindex, %db, %files_for_key, %file_cols, %file_key_locs);

# @allcols lists distinct column names collected from input headers in
# first-seen order; %is_column marks names already in @allcols.
# %all_keys is not used by the code visible in this file.
my (@allcols, %is_column, %all_keys);

# Normalise one CSV field.
#
# Takes the raw field value as its only argument and returns a cleaned
# copy; the argument itself is not modified.  An undefined value is
# returned as the empty string.
#
# Steps, in order:
#   1. strip trailing, then leading, whitespace;
#   2. collapse every run of two or more whitespace characters into a
#      single space;
#   3. strip trailing, then leading, commas.
sub clean {
    my $x = $_[0];
    return '' if !defined($x);
    chomp $x;
    $x =~ s/\s+\z//;
    $x =~ s/\A\s+//;
    # This pattern used to be anchored with "$", so it could never match
    # once trailing whitespace was gone and inner runs were left as-is.
    $x =~ s/\s\s+/ /g;
    $x =~ s/,+\z//;
    $x =~ s/\A,+//;
    return $x;
}

# undef $/;

#
# Read the input files
#
# For every input file: read the header, locate the key columns, then
# index each data row by its composite key.  A file that lacks any key
# column is reported on STDERR and contributes nothing to the output.
for my $file (@ARGV) {
    # A file is identified in all tables, and in the output "Source"
    # column, by its name without a trailing ".csv".
    my $file_out = $file;
    $file_out =~ s/\.csv$//;

    # Test open()'s return value: the lexical handle is autovivified
    # before the open is attempted, so it is defined even on failure.
    my $fileh;
    if (!open($fileh, '<', $file)) {
        print STDERR "Could not open $file: $!\n";
        exit 1;
    }
    # NOTE(review): presumably a workaround so that the method-style
    # getline call made by the CSV module resolves on this handle;
    # confirm whether it is still needed.
    bless $fileh, "IO::File";

    my $header = $csv->getline($fileh);
    if (!defined($header)) {
        # Diagnostics go to STDERR so they never end up in the CSV output.
        print STDERR "Could not read CSV header line from $file\n";
        exit 1;
    }
    my @colnames = @{$header};

    # Record where each key column sits in this file.
    my @key_locs;
    for my $j (0 .. $#colnames) {
        my $ki = $key_index{$colnames[$j]};
        if (defined($ki)) { $key_locs[$ki] = $j }
    }

    my $skip = 0; # is this file broken?
    for my $j (0 .. $#key_fields) {
        if (!defined($key_locs[$j])) {
            print STDERR "File $file is missing column \"$key_fields[$j]\", skipping.\n";
            $skip = 1;
        }
    }
    if ($skip) {
        close($fileh);
        next;
    }

    # Register this file's columns only now that it is known to be
    # usable, so a skipped file cannot add empty columns to the output.
    my %col_loc;
    for my $j (0 .. $#colnames) {
        my $col = $colnames[$j];
        $col_loc{$col} = $j;
        if (!$is_column{$col}) {
            $is_column{$col} = 1;
            push @allcols, $col;
        }
    }

    my %rows;
    while (my $row = $csv->getline($fileh)) {
        my @data = @{$row};
        for my $field (@data) {
            $field = clean($field);
        }

        # The row key is every key field, in -k order, each preceded
        # by ':'.  A row too short to hold a key column contributes an
        # empty part.
        my $key = '';
        for my $loc (@key_locs) {
            my $part = $data[$loc];
            $key .= ':' . (defined($part) ? $part : '');
        }

        # List this file once per key even when the key repeats within
        # the file; the last such row is the one kept.
        if (!exists $rows{$key}) {
            push @{ $files_for_key{$key} }, $file_out;
        }
        $rows{$key} = \@data;
    }
    # getline() also returns undef on a parse error; report that case
    # instead of silently dropping the rest of the file.
    if (!$csv->eof()) {
        my @diag = $csv->error_diag();
        my $why = defined($diag[1]) ? " ($diag[1])" : "";
        print STDERR "Stopped reading $file before end of file: CSV parse error$why\n";
    }

    $db{$file_out} = \%rows;
    $file_key_locs{$file_out} = \@key_locs;
    $file_cols{$file_out} = \%col_loc;

    close($fileh);
}

# All row keys seen in any input, in string order.
my @keys = sort keys %files_for_key;

print STDERR scalar(@keys), " distinct keys seen.\n";

#
# Generate output
#

# Header: every known column followed by a "Source" column.
$csv->combine(@allcols, "Source");
print $csv->string(), $nl;

foreach my $key (@keys) {
    my @files = @{$files_for_key{$key}};
    my $files_list = "@files";

    # With -s, keep only keys that occur in the requested source.
    if (defined($req_source)) {
        next unless grep { $_ eq $req_source } @files;
    }

    # Merge the data for each column from all files holding this key.
    # When the files disagree, the cell lists every differing value as
    # "source:value", separated by spaces.
    my @row = ();
    for my $c (0 .. $#allcols) {
        my $col = $allcols[$c];
        my $any_conflict = 0;
        my $curval;
        my $curval_file;    # the file that supplied $curval
        foreach my $f (@files) {
            my $loc = $file_cols{$f}->{$col};
            next if !defined($loc);     # this file has no such column
            my $val = $db{$f}->{$key}->[$loc];
            next if !defined($val);     # row too short to hold it
            if (!defined($curval)) {
                $curval = $val;
                $curval_file = $f;
            } elsif ($curval ne $val) {
                if (!$any_conflict) {
                    # Label the first value with the file it came from,
                    # which is not always the first file for this key.
                    $curval = "$curval_file:$curval";
                    $any_conflict = 1;
                }
                $curval .= " $f:$val";
            }
        }
        $row[$c] = $curval;
    }
    $csv->combine(@row, $files_list);
    print $csv->string(), $nl;
}
