#!/usr/bin/perl
#
# Joins multiple CSV files into one CSV file written to standard output.
#
# Rows are identified by the values of the column(s) named with -k. Every
# input file must have a header line containing all key columns; files that
# lack one are reported on STDERR and skipped. The output has one row per
# distinct key, the union of all columns seen in the accepted files, and a
# final "Source" column listing the files that contributed to the row.
# When files disagree on a cell the cell becomes "file:value file:value ...".
#
# Files are referred to by their command-line name with a trailing ".csv"
# removed; that is the form used in the "Source" column, in conflict labels,
# and the form the -s option is compared against.

use strict;
use warnings;
use Text::CSV;
use Text::CSV::Encoded;
use File::Basename;
use IO::File;    # input handles are blessed into IO::File below

# Print the usage summary and exit with status 1.
# Written to STDERR so it can never end up inside redirected CSV output.
sub usage {
    my $base = basename($0);
    print STDERR "$base [-s source] -k key1 [ ... -k keyn] ...\n";
    print STDERR " Joins multiple CSV files into one CSV file using the\n";
    print STDERR " column(s) named using -k to define row identity\n";
    print STDERR "\n";
    print STDERR " -s source : output only rows derived partly from that source\n";
    exit 1;
}

# Normalise one cell value: remove leading and trailing whitespace and
# commas, and collapse each run of two or more whitespace characters into a
# single space. An undefined value is returned as the empty string.
sub clean {
    my $x = $_[0];
    return '' if !defined $x;

    # Whitespace and commas are stripped together so that mixed runs such
    # as "a ," are fully removed.
    $x =~ s/\A[\s,]+//;
    $x =~ s/[\s,]+\z//;

    # Not anchored: runs anywhere inside the value are collapsed.
    $x =~ s/\s\s+/ /g;
    return $x;
}

my $nl  = "\r\n";    # line terminator for the generated CSV
my $csv = Text::CSV::Encoded->new({ encoding_in => "UTF-8", encoding_out => "UTF-8" });

# A source that must be present in each output row
my $req_source;

my @key_fields;    # key column names, in the order given with -k
my %key_index;     # key column name => its position in @key_fields

while (@ARGV && $ARGV[0] =~ /^-/) {
    my $opt = shift @ARGV;
    if ($opt eq '-k') {
        my $k = shift @ARGV;
        if (!defined $k) {
            print STDERR "Option -k requires a column name.\n";
            usage();
        }

        # A repeated -k for the same column is ignored; registering it twice
        # would leave an earlier key slot that no file could ever fill.
        if (!exists $key_index{$k}) {
            push @key_fields, $k;
            $key_index{$k} = $#key_fields;
        }
    }
    elsif ($opt eq '-s') {
        $req_source = shift @ARGV;
        if (!defined $req_source) {
            print STDERR "Option -s requires a source name.\n";
            usage();
        }
    }
    else {
        print STDERR "Unknown option $opt.\n";
        usage();
    }
}

# At least one input file and one key column are required.
usage() if !@ARGV || !@key_fields;

my %db;               # file name => { row key => [ cleaned cells ] }
my %files_for_key;    # row key   => [ file names containing that key ]
my %file_cols;        # file name => { column name => column index }
my @allcols;          # every column name seen, in first-seen order
my %is_column;        # column name => 1 once it is in @allcols

#
# Read the input files
#
for my $file (@ARGV) {
    my $file_out = $file;
    $file_out =~ s/\.csv$//;

    # Failure is detected from open's return value.
    my $fileh;
    if (!open($fileh, '<', $file)) {
        print STDERR "Could not open $file: $!\n";
        exit 1;
    }

    # Kept from the original: hand the parser an IO::File object.
    bless $fileh, "IO::File";

    my $header = $csv->getline($fileh);
    if (!defined $header) {
        print STDERR "Could not read CSV header line from $file\n";
        exit 1;
    }
    my @colnames = @{$header};

    # Find where each key column lives in this file.
    my @key_locs;
    for my $j (0 .. $#colnames) {
        my $ki = $key_index{ $colnames[$j] };
        $key_locs[$ki] = $j if defined $ki;
    }

    # A file without every key column cannot be joined. This is decided
    # before its columns are registered, so a skipped file contributes
    # nothing to the output header.
    my $skip = 0;
    for my $ki (0 .. $#key_fields) {
        next if defined $key_locs[$ki];
        print STDERR "File $file is missing column \"$key_fields[$ki]\", skipping.\n";
        $skip = 1;
    }
    if ($skip) {
        close($fileh);
        next;
    }

    # Register this file's columns; a repeated header name keeps its last
    # position.
    my %col_loc;
    for my $j (0 .. $#colnames) {
        my $col = $colnames[$j];
        if (!$is_column{$col}) {
            $is_column{$col} = 1;
            push @allcols, $col;
        }
        $col_loc{$col} = $j;
    }

    my %rows;
    while (my $row = $csv->getline($fileh)) {
        my @data = map { clean($_) } @{$row};

        # The row key is every key value prefixed with ':'. A row too short
        # to contain a key column contributes an empty value.
        my $key = '';
        for my $loc (@key_locs) {
            my $part = $data[$loc];
            $key .= ':' . (defined $part ? $part : '');
        }

        # Record the file once per key, even if the key repeats within the
        # file; the last such row wins.
        if (!exists $rows{$key}) {
            push @{ $files_for_key{$key} }, $file_out;
        }
        $rows{$key} = \@data;
    }

    # getline returns undef both at end of file and on a parse error; only
    # the latter leaves eof false.
    if (!$csv->eof()) {
        print STDERR "Stopped reading $file early: " . $csv->error_diag() . "\n";
    }

    $db{$file_out}        = \%rows;
    $file_cols{$file_out} = \%col_loc;
    close($fileh);
}

my @keys = sort { $a cmp $b } keys %files_for_key;
print STDERR scalar(@keys), " distinct keys seen.\n";

#
# Generate output
#
$csv->combine(@allcols, "Source");
print $csv->string(), $nl;

KEY:
for my $key (@keys) {
    my @files = @{ $files_for_key{$key} };

    # With -s, only keys that occur in the required source are written.
    if (defined $req_source) {
        next KEY unless grep { $_ eq $req_source } @files;
    }

    # Merge the data for each column from all files
    my @row;
    for my $c (0 .. $#allcols) {
        my $col = $allcols[$c];
        my $curval;              # merged value so far
        my $curfile;             # file that supplied the first value
        my $any_conflict = 0;

        for my $f (@files) {
            my $loc = $file_cols{$f}{$col};
            next if !defined $loc;    # this file has no such column

            my $val = $db{$f}{$key}[$loc];
            next if !defined $val;    # row shorter than the header

            if (!defined $curval) {
                $curval  = $val;
                $curfile = $f;
            }
            elsif ($any_conflict) {
                # Once values disagree, every further value is listed.
                $curval .= " $f:$val";
            }
            elsif ($curval ne $val) {
                # Label the first value with the file it actually came from.
                $curval       = "$curfile:$curval $f:$val";
                $any_conflict = 1;
            }
        }
        $row[$c] = $curval;
    }

    $csv->combine(@row, join(' ', @files));
    print $csv->string(), $nl;
}