#!/usr/bin/perl -w -T
#
# $Header: /home/LDMS/cvsroot/LDMS/LDMS.pl,v 1.30 2000/11/28 18:27:33 jcl53 Exp $


#==============================================================================
# ENVIRONMENT SETTINGS

# The script runs in taint mode (-T on the shebang line), where the inherited
# PATH is considered untrusted; pin it to a fixed list of directories.
# NOTE:  the 'use' lines below take effect at compile time, i.e. before the
# PATH assignment and the version check are executed at run time.
$ENV{PATH} = '/bin:/usr/bin:/usr/local/bin';  # Set a safe default PATH.
require 5.005;  # Safe minimum version.  Change at your own risk.
use strict;     # Make sure we declare variables, &c.
use lib '.';    # Look for modules in current directory, too.


#==============================================================================
# IMPORTED MODULES

use Getopt::Long;           # Command-line option parser.
use File::Basename;         # Filename munger.
use File::Spec::Functions;  # Platform-independent path construction.
use LDMS::DataText;         # LDMS DataText blocks.
use LDMS::Date;             # LDMS DATE tag.
use LDMS::DivSource;        # LDMS DIVSOURCE handler.
use LDMS::Error;            # LDMS error messages.
use LDMS::Footnote;         # LDMS footnote handler.
use LDMS::FootRef;          # LDMS FOOTREF tag.
use LDMS::Input;            # LDMS input file parser.
use LDMS::LP;               # LDMS CITE and EXPCITE.
use LDMS::State;            # LDMS state machine.
use LDMS::Status;           # LDMS status messages.
use LDMS::Table;            # LDMS table handler.
use LDMS::XMLOut;           # LDMS output file handler.


#==============================================================================
# MODULE SETTINGS

## Getopt::Long
#   auto_abbrev   Long options may be shortened to any unique prefix.
#   permute       Options and non-option arguments may be intermixed.
#   bundling      Single-letter options may be bundled (e.g. -vf).
#   pass_through  Unrecognized options are left in @ARGV instead of being
#                 reported as errors; the main script then sees them as the
#                 input filename or as a leftover "bad parameter".
Getopt::Long::Configure(qw/auto_abbrev permute bundling pass_through/);


#==============================================================================
# LOCAL VARIABLES

# Command-line options (filled in by the GetOptions() call in the main script).
#   $opt_help     Show help message and exit?  (Default:  no.)
#   $opt_verbose  Verbosity level.  (Default:  0, i.e. quiet.)
#   $opt_force    Force output file clobbering?  (Default:  no.)
#   $opt_output   Output filename.  (Default:  empty, i.e. derived from input.)
#   $opt_status   Maximum depth to report status for.  (Default:  2.)
my ($opt_help, $opt_verbose, $opt_force) = (0, 0, 0);
my ($opt_output, $opt_status) = ("", 2);

# Working state shared by the main script and the subroutines below.
#   $inputFilename, $outputFilename  Input and output filenames.
#   $fh                              Filehandle placeholder.
#   @blockifiedTitle                 Title, divided up into -CITE- blocks.
#   @blockArray                      Single -CITE- block, as an array of lines.
#   $lineCounter                     Number of lines processed.
#   $lastStatus                      Line number of last status message printed.
my ($inputFilename, $outputFilename) = ("", "");
my $fh = "";
my (@blockifiedTitle, @blockArray);
my ($lineCounter, $lastStatus) = (0, 0);


#==============================================================================
# MAIN SCRIPT

## Extract and parse command-line options.
# --help prints the usage text and exits successfully right away; every other
# option just records its value in the matching $opt_* variable.
GetOptions('help|h|?' => sub {&showHelp(); exit},
           'verbose|v+' => \$opt_verbose,
           'output|o=s' => \$opt_output,
           'force|f' => \$opt_force,
           'status|s=i' => \$opt_status
           );


## Get input filename, and validate, if necessary.
# All failures below print the usage text followed by a specific message and
# exit with a non-zero status, so that callers can detect the error.
if (scalar(@ARGV) > 0) {
    $inputFilename = shift(@ARGV);
}
else {
    &showHelp();
    print STDERR "No input file specified!\n";
    exit 1;
}
if (!(-r $inputFilename)) {
    &showHelp();
    print STDERR "Input file \'$inputFilename\' is not readable.\n";
    exit 1;
}


## Exactly one input file is accepted; anything still left is an error.
if (scalar(@ARGV) > 0) {
    &showHelp();
    print STDERR "Bad parameter \'$ARGV[0]\'!\n";
    exit 1;
}


## Set output filename.
if ($opt_output ne "") {
    $outputFilename = $opt_output;
}
else {
    # Derive the name from the input:  swap the extension for '.xml', or
    # append '.xml' when there is no extension to swap.  (Without the append,
    # the output name would equal the input name, and --force would then
    # overwrite the input file.)
    $outputFilename = $inputFilename;
    unless ($outputFilename =~ s/\.\w*$/.xml/) {
        $outputFilename .= ".xml";
    }
}


## If the output filename is a directory, slap '.xml' on input and write there.
if (-d $outputFilename) {
    $outputFilename = catfile($outputFilename, &basename($inputFilename));
    unless ($outputFilename =~ s/\.\w*$/.xml/) {
        $outputFilename .= ".xml";
    }
}


## Make sure we're not clobbering nuthin'--unless forced to.
if ((-e $outputFilename) && ($opt_force <= 0)) {
    &showHelp();
    print STDERR "Output file \'$outputFilename\' already exists!\n";
    exit 1;
}


## Check our write permissions.
# Only an existing file is checked; a not-yet-existing file is assumed to be
# creatable.
if ((-e $outputFilename) &&
    !(-w $outputFilename)) {
    &showHelp();
    print STDERR "Output file \'$outputFilename\' is not writeable!\n";
    exit 1;
}


## If verbose, tell what the final output filename ended up being.
if ($opt_verbose > 0) {
    print STDERR "XML output to:  $outputFilename\n";
}


## Get our input filehandle
# NOTE(review):  openFile() is not defined in this file (presumably exported
# by LDMS::Input); its return value is not checked for failure here.
$fh = &openFile($inputFilename);


## Blockify the title.
# The whole input is read into @blockifiedTitle before any output is written.
@blockifiedTitle = &readBlock($fh);


## Close input file.
close $fh;


## Initialize the output writer.
# $fh is reused:  from here on it holds whatever Initialize() returns, which
# is handed back to Finalize() at the end of the main script.
$fh = &Initialize($outputFilename);


# Zero the state machine (shouldn't need to be done, but still good).
&initState();


## Write an LDMS tag as our top-level document tag.
&BeginTag("LDMS");
&WriteString("\n");


## Process the title, in blocks demarcated by -CITE-...
# Each pass loads one block into the file-level @blockArray, which the
# subroutines below (markupCite, fastForward, processField, ...) consume
# line by line from the front.
# NOTE(review):  element 0 of @blockifiedTitle is deliberately skipped --
# presumably it holds whatever precedes the first -CITE-; confirm against
# readBlock().
for (my $i = 1; $i < scalar(@blockifiedTitle); $i++) {

    # Progress report, limited to levels no deeper than --status.
    if (($opt_verbose >= 1) &&
        (&getVLevel() > 0) &&
        ($opt_status > 0) &&
        (&getVLevel() <= $opt_status)) {

        print STDERR "Processing " . &getLevelLabel() . " " . &getHLevel() . ".\n";

    }

    my $currentBlock = $blockifiedTitle[$i];

    # Get rid of the flanking -CITE- tags, and hose the leftover newline.
    $currentBlock =~ s/\-CITE\-\s*\n//g;

    # Convert block to an array.  (Splits on LF or CR/LF.)
    @blockArray = split(/\r*\n/, $currentBlock);

    # Only do everything else if the block is not empty.
    next if (scalar(@blockArray) == 0);

    my $sectionLabel = "";  # What this title calls this division.

    # Grab the name of this structural division from the first line that
    # follows the -HEAD- dashline.  The label is the second element of that
    # line split on whitespace; if there is none, the label stays empty
    # rather than becoming undefined.
    my $sawHead = 0;
    foreach my $headline (@blockArray) {
        if ($headline =~ /^\s*\-HEAD\-/) {
            $sawHead = 1;
            next;
        }
        if ($sawHead == 1) {
            if ($headline ne "") {
                my @section = split(/\s+/, $headline);
                $sectionLabel = $section[1] if defined($section[1]);
            }
            last;
        }
    }

    # Begin new structural division.
    if (&getVLevel() < 0) {  # Nothing in @path, assume top-level.
        &descendLevel($sectionLabel);
        &startStructdiv();
    }
    elsif (&getLevelLabel() eq $sectionLabel) {  # Same level.
        &endStructdiv();
        &startStructdiv();
    }
    elsif (&labelInPath($sectionLabel) > 0) {
        # The label is one of our ancestors:  close divisions on the way
        # up until we reach it, then start its next sibling.
        while (&getVLevel() >= 0) {
            if (&getLevelLabel() eq $sectionLabel) {
                &endStructdiv();
                &startStructdiv();
                last;
            }
            &endStructdiv();
            &ascendLevel();
        }
    }
    else {  # Never seen this label:  it is a new, deeper level.
        &descendLevel($sectionLabel);
        &startStructdiv();
    }

    # Start our TITLEDATA tag.
    &BeginTag("TITLEDATA");
    &WriteString("\n");

    # Begin the NAVGROUP tag.
    &BeginTag("NAVGROUP");
    &WriteString("\n");

    # Markup -CITE- dashline.
    &markupCite();
    &fastForward();

    # Process -EXPCITE- dashline.  (The block may already be exhausted, so
    # check before peeking at its first line.)
    if ((scalar(@blockArray) > 0) &&
        ($blockArray[0] =~ /^\-EXPCITE\-/)) {

        # Hose the EXPCITE dashline.
        shift(@blockArray);

        &markupExpcite();
        &fastForward();
    }

    # Process -HEAD- tag.
    &processField("HEAD");

    # End the NAVGROUP tag.
    &EndTag("NAVGROUP");
    &WriteString("\n");

    # List of top-level fields that can be handled generically, in the
    # order in which they are looked for in the block.
    # NOTE:  This will prob'ly expand to include all fields!
    my @fields = qw/STATUTE SOURCE STATAMEND MISC1 REFTEXT MISC2 COD MISC3 CHANGE MISC4 TRANS MISC5 EXEC MISC6 CROSS MISC7 SECREF MISC8/;

    # Iterate over our field list and process each field.
    foreach my $field (@fields) {
        &processField($field);
    }

    # If there's anything left, speak now, or forever hold the pieces.
    while (scalar(@blockArray) > 0) {
        &WriteString(shift(@blockArray) . "\n");
    }

    # Close the TITLEDATA tag.
    &EndTag("TITLEDATA");
    &WriteString("\n");

}
# for (my $i = 1; $i < scalar(@blockifiedTitle); $i++)


## Unwind the state machine, closing every STRUCTDIV that is still open.
until (&getVLevel() < 0) {
    &endStructdiv();
    &ascendLevel();
}

## Close the top-level LDMS element.
&EndTag("LDMS");


## Hand the writer back so the XML document can be finished off.
&Finalize($fh);


#==============================================================================
# START STRUCTDIV TAG
=pod

=head2 startStructdiv()

Produces a <STRUCTDIV> tag that starts at the current hierarchical level.

=cut

sub startStructdiv() {

    # Open the tag first; the attributes are attached afterwards.
    &BeginTag("STRUCTDIV");
    &WriteString("\n");

    # Attribute name paired with the state-machine getter that supplies its
    # value, listed in the order in which the attributes are emitted.
    my @attributes = (
        [ "name",   \&getLevelLabel ],
        [ "vlevel", \&getVLevel ],
        [ "hlevel", \&getHLevel ],
    );
    foreach my $attribute (@attributes) {
        my ($attributeName, $getter) = @$attribute;
        &AddAttribute($attributeName, &$getter());
    }

    # Step the state machine on, now that the current position is recorded.
    &advanceLevel();

}


#==============================================================================
# END STRUCTDIV TAG
=pod

=head2 endStructdiv()

Produces a </STRUCTDIV> tag.

=cut

# Closes the currently open STRUCTDIV element and follows it with a newline.
# Takes no arguments.  It does not touch the state machine itself; callers
# pair it with ascendLevel() or startStructdiv() as needed.
sub endStructdiv() {

    &EndTag("STRUCTDIV");
    &WriteString("\n");

}


#==============================================================================
# FAST-FORWARD
=pod

=head2 fastForward()

Fast-forwards to the next relevant block of text.

=cut

sub fastForward() {
    # Copy lines from the front of @blockArray straight to the output until
    # the block runs out or a dashline (e.g. -HEAD-, -MISC1-) is on top.
    until ((scalar(@blockArray) == 0) ||
           ($blockArray[0] =~ /^\-[A-Z]+[0-9]*\-/)) {
        my $passThrough = shift(@blockArray);
        &WriteString($passThrough . "\n");
    }
}


#==============================================================================
# FIELD EXTRACTOR
=pod

=head2 getField()

This takes everything up to the next dashline tag, removes it from the front of @blockArray, and returns it as a list of lines.

=cut

sub getField() {

    # Removes one field's worth of lines from the front of the file-level
    # @blockArray and returns them as a list.
    #
    # The first remaining line is always taken, whatever it contains; after
    # that, lines are taken up to (not including) the next dashline tag.
    # Returns an empty list when @blockArray is already empty.

    # Nothing left in this block:  hand back an empty field instead of a
    # one-element list holding undef.
    return () unless (scalar(@blockArray) > 0);

    my @block = (shift(@blockArray));
    while ((scalar(@blockArray) > 0) &&
           !($blockArray[0] =~ /^\-[A-Z]+[0-9]*\-/)) {
        push(@block, shift(@blockArray));
    }

    # Put a trailing whitespace-only line, if any, back on the block, so
    # that it is passed through as-is instead of belonging to the field.
    # (A completely empty line does not match and stays in the field.)
    if ($block[$#block] =~ /^\s+$/) {
        unshift(@blockArray, pop(@block));
    }

    return @block;
}


#==============================================================================
# -CITE- HANDLER
=pod

=head2 markupCite()

Takes lines in @blockArray and processes them with the LP module.

=cut

sub markupCite() {

    # Emits a <CITE> element for the title line at the front of the
    # file-level @blockArray.  Leading blank lines are passed through, the
    # title number becomes the "titlenumber" attribute, and a date of the
    # form NN/NN/NN inside the title line is handed to tagDate().
    # Consumes the blank lines and the title line from @blockArray.

    my $titleNumber = 0;

    # Begin a CITE block.
    &BeginTag("CITE");
    &WriteString("\n");

    # Flush any preceding blank lines to output.  The emptiness check keeps
    # the loop from spinning forever once the block is used up (an undefined
    # first element would otherwise keep matching the blank-line pattern).
    while ((scalar(@blockArray) > 0) &&
           ($blockArray[0] =~ /^\s*$/)) {
        &WriteString(shift(@blockArray) . "\n");
    }

    # Grab title number.
    $titleNumber = &getCiteTitleNumber(@blockArray);

    # Add title number as an attribute to CITE tag.
    &AddAttribute("titlenumber", $titleNumber);

    # Add the actual title line to the CITE block.
    my $citeLine = shift(@blockArray);
    if (defined($citeLine)) {
        if ($citeLine =~ /(.*)(\d\d\/\d\d\/\d\d)(.*)/) {  # Check for date.

            # Copy the captures before calling out:  arguments are passed
            # by alias, so $1..$3 could change under the callee's feet.
            my ($beforeDate, $date, $afterDate) = ($1, $2, $3);

            &WriteString($beforeDate);
            &tagDate($date);
            &WriteString($afterDate . "\n");

        }
        else {

            # No date:  write the title line itself.  (It has already been
            # shifted off @blockArray, so do not shift again.)
            &WriteString($citeLine . "\n");

        }
    }

    # End CITE block.
    &EndTag("CITE");
    &WriteString("\n");

}


#==============================================================================
# -EXPCITE- HANDLER
=pod

=head2 markupExpcite()

Extracts EXPCITE block from the current block, and processes it with the LDMS::LP module.

=cut

sub markupExpcite() {

    # Pull the EXPCITE lines off the current block and hand them to the
    # LP module, which then serves them back one entry at a time.
    my @expciteLines = &getField();
    &initExpcite(@expciteLines);

    # Open the EXPCITE element and record its level.
    &BeginTag("EXPCITE");
    &WriteString("\n");
    &AddAttribute("level", &getExpciteLevel());

    # One DIVEXPCITE element per entry; an empty entry ends the list.
    for (my $entry = &getNextExpciteEntry();
         $entry ne "";
         $entry = &getNextExpciteEntry()) {

        &BeginTag("DIVEXPCITE");
        &WriteString("\n");
        &WriteString("$entry\n");
        &EndTag("DIVEXPCITE");
        &WriteString("\n");

    }

    # Close the EXPCITE element.
    &EndTag("EXPCITE");
    &WriteString("\n");

}


#==============================================================================
# GENERIC FIELD HANDLER
=pod

=head2 markupField($)

Many fields can be handled alike.  This function processes the fields with the appropriate module, if any.  It determines what field it's working on by checking the parameter passed in for the name.

=cut

sub markupField($) {

    my $fieldName = shift;

    # Fields whose contents are DataText; these go through the table
    # handler.  SOURCE is handled separately below.
    my %isDataTextField = map { ($_, 1) }
        qw/MISC1 MISC2 MISC3 MISC4 MISC5 MISC6 MISC7 MISC8 REFTEXT COD CHANGE TRANS EXEC CROSS SECREF STATUTE STATAMEND/;

    # Take this field's lines off the current block before any tag is opened.
    my @fieldLines = &getField();

    # Open an element named after the field.
    &BeginTag($fieldName);
    &WriteString("\n");

    # Dispatch on the expected type of contents.  A handler takes over all
    # of the lines, so nothing is left for the pass-through step.
    if (exists($isDataTextField{$fieldName})) {
        &markUpTable(@fieldLines);
        @fieldLines = ();
    }
    elsif ($fieldName eq "SOURCE") {
        &tagDivSource(@fieldLines);
        @fieldLines = ();
    }

    # Any field without a dedicated handler is copied through line by line.
    foreach my $line (@fieldLines) {
        &WriteString($line . "\n");
    }

    # Close the field's element.
    &EndTag($fieldName);
    &WriteString("\n");

}


#==============================================================================
# GENERIC FIELD PROCESSOR
=pod

=head2 processField($)

This function does preliminary handling for generic fields.  It basically takes care of the dashline and then passes whatever's left on to the markupField($) subroutine.

=cut

sub processField($) {

    my $fieldName = shift;

    # Is the next unread line this field's dashline (e.g. -HEAD-)?
    my $atDashline = ((scalar(@blockArray) > 0) &&
                      ($blockArray[0] =~ /^\-$fieldName\-/));

    if ($atDashline) {

        # Drop the dashline, mark up the field's contents, then pass any
        # stray lines through until the next dashline.
        shift(@blockArray);
        &markupField($fieldName);
        &fastForward();

    }

}


#==============================================================================
# PRINT STATUS MESSAGES
=pod

=head2 showStatus()

Prints a status message, if necessary.

=cut

sub showStatus() {

    # Status output is produced only when --verbose was given at least once.
    my $wantStatus = ($opt_verbose > 0);

    if ($wantStatus) {

        # Feed the current line count to the status module, then print.
        &initStatus($lineCounter);
        &printStatus();

    }

}


#==============================================================================
# HELP FUNCTION
=pod

=head2 showHelp()

Does exactly what it says.  Displays the current help text.

=cut

sub showHelp() {

    # Prints the usage summary to STDERR.  Takes no arguments and does not
    # exit; callers decide what to print next and how to terminate.
    # The text mirrors the option specification passed to GetOptions():
    # -o takes a string, -s takes an integer (default 2), -v may be repeated.
    print STDERR <<"EndHelpText";

Usage:  $0 [-hvf] [-s <DEPTH>] [-o <OUTPUT>] <FILE>

  Options:
    -h, -?, --help                    Show this help message.
    -v, --verbose                     Enable verbose output.
    -o <OUTPUT>, --output <OUTPUT>    Write to file (or into directory) <OUTPUT>.
    -f, --force                       Overwrite existing files.
    -s <DEPTH>, --status <DEPTH>      Set maximum status reporting depth
                                      (default:  2).


EndHelpText

}


#==============================================================================
# INTERNAL DOCUMENTATION
=pod

=head1 NAME

LDMS.pl - converts a title of the USC into XML

=head1 SYNOPSIS

B<./LDMS.pl> [I<OPTIONS>]... I<FILE>

=head1 DESCRIPTION

Converts the given file from 8-bit ASCII into XML, as per the LDMS DTD.

B<-h, -?, --help>          Show this help message.

B<-v, --verbose>           Enable verbose output.

B<-o, --output [FILE]>     Write to FILE.

B<-f, --force>             Overwrite existing files.

B<-s, --status [DEPTH]>    Set maximum status reporting depth.

=head1 AUTHORS

=over 5

The LDMS Team:

=item *

Ju Joh

=item *

Sylvia Kwayke

=item *

Jason Lee

=item *

Nidhi Loyalka

=item *

Omar Mehmood

=item *

Charles Shagong

=item *

Brian Williams

=back

=head1 FILES

LDMS.dtd     Defines tags used in translating to XML.

=head1 NOTES

As this script was written in Perl by several different developers, the motto "There's more than one way to do it!" may be quite evident in places.  Nevertheless, we've tried to make the code easy to modify and maintain...

We hope we succeeded.

=cut


__END__;


$Log: LDMS.pl,v $
Revision 1.30  2000/11/28 18:27:33  jcl53
Added the status granularity setting.  Defaults to '2', or the 'Sec.' level in Title 1.

Footnotes still don't work properly.  I'm tagging the current build as 1.0, but I expect we'll have the footnotes working by delivery.

Revision 1.29  2000/11/28 16:21:22  jcl53
Integrated Table.pm into the main script.  We now have table support...  Unfortunately, marked-up tables are not human-parseable anymore.  That's the legacy of HTML, I suppose.  *sigh*

Revision 1.28  2000/11/28 15:58:51  jcl53
Integrated Date module.  To see its work, look at the <DATE> tags in the <CITE> blocks.

Revision 1.27  2000/11/28 15:03:17  jcl53
Integrated LDMS::DivSource.

Revision 1.26  2000/11/20 22:08:46  jcl53
Added pod documentation.

Added a lot of internal commenting.

Fixed a few functions so they're prototyped instead of "open".

Revision 1.25  2000/11/18 20:07:51  jcl53
Fixed a misnamed function call.  Oops.

Revision 1.24  2000/11/18 15:16:17  jcl53
Added the <TITLEDATA> and <NAVGROUP> tags.  *yawn*

Revision 1.23  2000/11/17 23:15:13  jcl53
Script only checks if destination is writeable if it exists.  This might get a bit more sophisticated later on.

Revision 1.22  2000/11/17 20:50:37  jcl53
Moved the major DataText processing into the DataText.pm module, in preparation for chaining the tagging process.

Revision 1.21  2000/11/17 20:06:03  jcl53
The main script now uses State.pm for all its labeling needs.

Revision 1.20  2000/11/16 20:10:19  jcl53
Added checking to see if we can write to the output file.  If not, we yell at the user and exit.

Revision 1.19  2000/11/16 03:32:17  jcl53
Erm...  Removed the debug output from the STRUCTDIV tagging functions.

Also added (just 'cause I thought I might need it later) the ability to display the output filename with the --verbose option.

Revision 1.18  2000/11/16 03:08:28  jcl53
We now have complete support for the <STRUCTDIV> tag.

NOTES
-----
Our specification requires the hlevel and vlevel to start at 0.  However, since lawyers seem to start counting at 1 (or "A", or what have you), I'd argue that perhaps we should start these at 1 as well...

Revision 1.17  2000/11/15 04:40:36  jcl53
LDMS.pl now has the first elements of support for the STRUCTDIV tag--namely a state machine stack and functions that handle the tagging automagically.  Right now, it just tags the topmost level (i.e. "Title"), but I'm working on making it figure out levels from the -HEAD- field.  Once that's done, the rest is trivial...

Revision 1.16  2000/11/15 01:59:50  jcl53
I got bored, so I added some creature comforts to the LDMS.pl main loop.  If the output filename is a directory, output will be deposited in a file derived from the input filename into the specified directory.

TO DO
-----
Add checking to see if we're allowed to write to the destination directory, whether specified or not.

Revision 1.15  2000/11/15 01:28:09  jcl53
Added the -o (--output) option to specify an output filename (and path, by extension).

Added checking to see if we're clobbering an existing file with our output.

Added the -f (--force) option to allow the user to ignore our sanity checks and clobber existing files.

Migrated the Getopt::Long setup and parsing to the new (as in, "post-Y2K") format.

Verbose flag (following the example set by the other option flags) is now in $opt_verbose.  Zero means absolutely quiet.  One (or more!) means we provide status output, &c.

Revision 1.14  2000/11/14 21:20:33  jcl53
Added information on exactly where to put the filename of the input file.

Bit the bullet and simplified the main loop to an iteration over field names.  This assumes (as it should!) that list ordering in Perl is preserved.

TO DO
-----
Try to shoehorn <CITE> and <EXPCITE> into the generic field processor.  The less code duplication, the better!

Revision 1.13  2000/11/11 17:37:52  jcl53
This revision (I'm tagging it as alpha_0_3_1) now uses Brian's new-and-improved XMLOut module.  Therefore, it now adds attributes!

TO DO
-----
Comment the whole damn thing... *sigh*

Revision 1.12  2000/11/11 15:17:03  jcl53
Now looks for modules in current directory, as opposed to my own directory (which could cause problems for the rest of you).

Revision 1.11  2000/11/11 06:38:26  jcl53
0.3 candidate now does proper subdivision of data into separate DATATEXT tags, instead of putting all the DATATEXTNAME and #PCDATA into a single DATATEXT tag.  Only one DATATEXTNAME per DATATEXT, and a new DATATEXTNAME starts a new DATATEXT.

Although XMLOut.pm is not integrated due to problems with attributes, I'm going to freeze this as version 0.3.  Good night, everyone!

Revision 1.10  2000/11/11 06:19:53  jcl53
The main LDMS script now does DATATEXT and its associated DATATEXTNAME tag.

Revision 1.9  2000/11/11 04:18:35  jcl53
Put those generic field functions to work.  Now, just about every field is tagged!  Next up:  working in a bit of DataText...

Revision 1.8  2000/11/11 03:44:03  jcl53
The Wunderscript has functions to support basic field operations, such as extracting them and fast-forwarding to the next one.  It also tags up -HEAD- fields.

Revision 1.7  2000/11/11 02:08:16  jcl53
Build 0.3 candidate now tags up CITE, EXPCITE, and DIVEXPCITE properly.  I'm working on getting it to tag all the other top-level dashlines.

Revision 1.6  2000/11/10 07:40:21  jcl53
Ugh...  Still trying to localize the attribute clumping in the XML output module.  I'm throwing in the towel for the night.  Back at about 8 AM.

Revision 1.5  2000/11/10 06:29:54  jcl53
Well, theoretically, this one marks up CITE blocks.  Unfortunately, all is not well in the LDMS::XMLOut module.  All the attribute tags are mushed together in the first tag...  I'm working on it.

Revision 1.4  2000/11/10 02:10:51  jcl53
The Amazing Technicolor XMLizer now outputs the input file to STDOUT.

Revision 1.3  2000/11/10 01:55:19  jcl53
The log messages are now set up as pod documentation.  Rejoice.

Revision 1.2  2000/11/10 01:36:15  jcl53
Look, everybody!  It's the first commit of the modular LDMS program!  Oh, joy!

This shows off the new modular design of the LDMS system by opening and closing a file...  Okay, so it's really not that spectacular--but consider it proof that we (or at least 'I', heh) know what we're doing...

Next on the block--Object-Oriented Perl!

*dodges the tomatoes and ducks back into emacs*

Revision 1.1  2000/10/26 20:41:39  jcl53
Main script for the LDMS Project.
