package LDMS::DataText;

# DataText.pm
#
# Handles the <DATATEXT> division of each title.
#
# $Header: /home/LDMS/cvsroot/LDMS/LDMS/DataText.pm,v 1.22 2000/12/01 19:18:07 jcl53 Exp $
#
# TODO
#   Get a better regex for detecting headers.

use strict;
use warnings;
use lib '..';  # Make sure all modules are included AFTER this.
use LDMS::Footnote;
use LDMS::FootRef;
use LDMS::State;
use LDMS::XMLOut;
require 5.005;


## Module initializer.
#
# Runs at compile time so that $VERSION, @ISA and the export lists are
# populated before the rest of the file is compiled.
BEGIN {
    use Exporter ();
    our ($VERSION, @ISA, @EXPORT, @EXPORT_OK, %EXPORT_TAGS);

    # Set our version (for module version checking).
    # The digit groups of the CVS "Revision" keyword are pulled into @r;
    # the first is formatted with "%d." and each remaining one with "%02d"
    # ($#r repetitions), so "Revision: 1.22" becomes "1.22".
    $VERSION = do {my @r = (q$Revision: 1.22 $ =~ /\d+/g); sprintf "%d."."%02d" x $#r, @r};

    # Inherit import() from Exporter.
    @ISA = qw(Exporter);

    # Exported functions.
    # These names are exported by default; the leading "&" is optional
    # Exporter syntax for naming a function.
    @EXPORT = qw(&initDataText &getDataTextName &getDataTextBody &getDataTextLines &processDataText);
    %EXPORT_TAGS = ( );

    # Exported global variables and optional exported functions.
    # Nothing is offered for on-request export.
    @EXPORT_OK = qw();

}
our @EXPORT_OK;

# Non-exported global variables.


# Initialize exported global variables.


# Initialize non-exported global variables.


# Create private global variables.
my @currentDataText = ();


# Prototype functions.
#
# Forward declarations for every sub defined in this module.  Each
# prototype here is identical to the one on the sub's definition, so Perl
# does not report a prototype mismatch.  The last two entries name the
# DIVDATATEXT helpers that this file actually defines (startDivDataText
# and endDivDataText).
sub initDataText(@);
sub getDataTextBody;
sub getDataTextName;
sub getDataTextLines;
sub processDataText(@);
sub processDivDataText(@);
sub startDivDataText($$);
sub endDivDataText();


## Module destructor.
END { }


## initDataText - frontend for grabbing a DataText block.
#
# Preconditions:
#   - Requires an array of lines to be passed in.
#
# Postconditions:
#   - Array of lines is stored in @currentDataText for future processing.
#   - Whatever @currentDataText held before the call is discarded.
#

sub initDataText(@)
{
    
    # Get DataText block from arguments.
    # The argument list is copied, so later shifts by getDataTextName and
    # getDataTextBody consume this module's copy, not the caller's array.
    @currentDataText = @_;

}


## getDataTextName - Consumes the next line if it looks like a header.
#
# Preconditions:
#   - None; an empty @currentDataText simply yields "".
#
# Postconditions:
#   - If the first pending line is a header, it is removed from
#     @currentDataText and returned.
#   - Otherwise "" is returned and @currentDataText is left untouched.
#
# A line counts as a header (DATATEXTNAME) when all of these hold:
#   a.  It contains no lowercase letters.
#   b.  Its first non-whitespace character is an uppercase letter.
#   c.  It contains at least two consecutive uppercase letters.
#

sub getDataTextName {

    # Nothing queued, so nothing can be a header.
    return "" unless scalar(@currentDataText) > 0;

    my $candidate = $currentDataText[0];

    # Reject on the first rule that fails; the line stays queued.
    return "" if     $candidate =~ /[a-z]/;
    return "" unless $candidate =~ /^\s*[A-Z]/;
    return "" unless $candidate =~ /[A-Z][A-Z]/;

    # All three rules passed: take the line off the queue.
    return shift(@currentDataText);

}


## getDataTextBody - Consumes and returns the next pending line.
#
# Preconditions:
#   - None; an empty @currentDataText simply yields "".
#
# Postconditions:
#   - The first line of @currentDataText is removed and returned,
#     whatever it contains.
#   - Returns "" if @currentDataText is empty.
#
sub getDataTextBody {

    return (scalar(@currentDataText) > 0) ? shift(@currentDataText) : "";

}


## getDataTextLines - Returns the current text block.
#
# Preconditions:
#   - None, though it's useless without data in @currentDataText.
#
# Postconditions:
#   - Returns @currentDataText, come Hell or high water.
#   - In list context that is the lines not yet consumed by
#     getDataTextName/getDataTextBody; in scalar context Perl yields
#     the number of such lines instead.
#   - @currentDataText itself is not modified.
#

sub getDataTextLines {
    return @currentDataText;
}



## processDataText - Tags up a whole DataText block in one pass.
#
# Preconditions:
#   - Requires an array of strings (the lines of the block) to be passed in.
#
# Postconditions:
#   - @currentDataText is overwritten with the arguments and is empty
#     when the sub returns.
#   - Output is wrapped in DATATEXT tags.  Consecutive header lines are
#     grouped inside one DATATEXTNAME tag.
#   - Runs of non-header lines are collected and handed to
#     &processDivDataText.
#   - A header that follows body lines closes the current DATATEXT and
#     opens a new one.
#

sub processDataText(@) {

    @currentDataText = @_;

    # Mini state machine: "", "DATATEXTNAME" or "DATATEXTBODY".
    my $state   = "";
    my @pending = ();    # Body lines waiting for processDivDataText.

    &BeginTag("DATATEXT");
    &WriteString("\n");

    while (scalar(@currentDataText) > 0) {

        # Yields "" (and consumes nothing) when the next line is not a header.
        my $heading = getDataTextName();

        if ($heading eq "") {

            # Body line.  Leaving a run of headers closes DATATEXTNAME.
            if ($state eq "DATATEXTNAME") {
                &EndTag("DATATEXTNAME");
                &WriteString("\n");
            }
            $state = "DATATEXTBODY";

            push(@pending, getDataTextBody());
            next;

        }

        # Header line.  Emit any body collected so far before the header.
        if (scalar(@pending) > 0) {
            processDivDataText(@pending);
            @pending = ();
        }

        if ($state ne "DATATEXTNAME") {

            # A header after body text starts a fresh DATATEXT element.
            if ($state ne "") {
                &EndTag("DATATEXT");
                &WriteString("\n");
                &BeginTag("DATATEXT");
                &WriteString("\n");
            }

            &BeginTag("DATATEXTNAME");
            &WriteString("\n");
            $state = "DATATEXTNAME";

        }

        &WriteString("$heading\n");

    }

    # Input ended on a header: close the open DATATEXTNAME.
    if ($state eq "DATATEXTNAME") {
        &EndTag("DATATEXTNAME");
        &WriteString("\n");
    }

    # Input ended on body text: emit what is still pending.
    if (scalar(@pending) > 0) {
        processDivDataText(@pending);
        @pending = ();
    }

    &EndTag("DATATEXT");
    &WriteString("\n");

}


## processDivDataText - Processes divisions within DataText.
#
# Preconditions:
#   - Requires an array of strings to be passed in.
#
# Postconditions:
#   - Every line that opens with a recognized item label (see _divLabel)
#     starts a new DIVDATATEXT whose "name" is that label.
#   - Lines that come before the first label are wrapped in a
#     DIVDATATEXT whose "name" is "".
#   - Divisions are numbered sequentially from 0; that number is passed
#     to &startDivDataText as its second argument.
#   - If ANY line (the first one included) contains a "(FOOTNOTE n)"
#     marker, the lines of each division go to &tagFootnotes as one
#     list; otherwise each line is written with &WriteString.
#   - An empty argument list produces no output at all.
#

sub processDivDataText(@) {

    my @divDataText = @_;
    my $divFlag = 0;       # Are we actually in a DIVDATATEXT yet?
    my $number = 0;        # Sequential position of current DIVDATATEXT.
    my @buffer = ();       # Lines of the division currently being built.

    # Do we have footnotes?  Every line is inspected, so a marker on the
    # very first line (or in a one-line block) is not overlooked.
    my $footnotes = (grep { /\(FOOTNOTE\s\d+\)/ } @divDataText) ? 1 : 0;

    foreach my $line (@divDataText) {

        # Grab the label right away rather than relying on $1 surviving
        # the sub calls below.
        my $label = _divLabel($line);

        if (defined $label) {

            # A new division begins: emit the previous one's lines and
            # close its tag, if there was one.
            _flushDivBuffer($footnotes, @buffer);
            @buffer = ();

            &endDivDataText() if $divFlag;
            $divFlag = 1;

            &startDivDataText($label, $number);
            $number++;

        }
        elsif (!$divFlag) {

            # Unlabelled text before any division: open an anonymous one.
            &startDivDataText("", $number);
            $number++;
            $divFlag = 1;

        }

        # Kick the line to the output buffer.
        push(@buffer, $line);

    }

    # Empty output buffer, if necessary.
    _flushDivBuffer($footnotes, @buffer);

    # Add an extra fencepost, if we were in a DIVDATATEXT block.
    &endDivDataText() if $divFlag;

}


## _divLabel - Extracts the item label that opens a line, if any.
#
# Returns the label text (without its delimiters) when the line starts
# with whitespace followed by one of the recognized label styles, and
# undef otherwise.  Patterns are tried in the order listed.
#

sub _divLabel {

    my ($line) = @_;

    foreach my $pattern (
        qr/^\s+(\d+)\.\s/,           # 1.
        qr/^\s+(\w)\.\s/,            # A. or a.
        qr/^\s+(\d+\w)\.\s/,         # 1a.
        qr/^\s+([MCLXVI]+)\.\s/,     # I.
        qr/^\s+([mclxvi]+)\.\s/,     # i.
        qr/^\s+(\d\d\d\d)\s-\s/,     # 2000 -
        qr/^\s+\(([a-z])\)\s/,       # (a)
        qr/^\s+\(([A-Z])\)\s/,       # (A)
    ) {
        return $1 if $line =~ $pattern;
    }

    return;

}


## _flushDivBuffer - Emits the buffered lines of one division.
#
# Takes the footnote flag followed by the lines.  With the flag set the
# lines go to &tagFootnotes in a single call; otherwise each line is
# written, newline-terminated, with &WriteString.  Does nothing when
# there are no lines.
#

sub _flushDivBuffer {

    my ($footnotes, @lines) = @_;

    return unless @lines;

    if ($footnotes == 1) {
        &tagFootnotes(@lines);
        return;
    }

    &WriteString($_ . "\n") foreach @lines;
    return;

}


## startDivDataText - Opens a new DataText division.
#
# Preconditions:
#   - Requires two arguments, in order:  name and hlevel.
#
# Postconditions:
#   - A DIVDATATEXT tag is begun via &BeginTag, followed by a newline.
#   - Attribute "name" is set to the first argument.
#   - Attribute "vlevel" is set to whatever &getVLevel returns.
#   - Attribute "hlevel" is set to the second argument.
#

sub startDivDataText($$) {

    my ($name, $hlevel) = @_;

    &BeginTag("DIVDATATEXT");
    &WriteString("\n");

    # Attributes are added in a fixed order: name, vlevel, hlevel.
    &AddAttribute("name", $name);
    &AddAttribute("vlevel", &getVLevel());
    &AddAttribute("hlevel", $hlevel);

}


## endDivDataText - Ends a DataText division.
#
# Preconditions:
#   - None, save that there's an output buffer.
#   - Takes no arguments (empty prototype).
#
# Postconditions:
#   - Ends a DIVDATATEXT tag in the output buffer.
#   - A newline is written after the closing tag.
#

sub endDivDataText() {

    # Counterpart of startDivDataText: close the tag, then end the line.
    &EndTag("DIVDATATEXT");
    &WriteString("\n");

}


1;
__END__;

=pod 
$Log: DataText.pm,v $
Revision 1.22  2000/12/01 19:18:07  jcl53
11th-hour commenting.  It is now possible to navigate this module and not lose your grip on sanity.

Revision 1.21  2000/11/30 16:19:27  jcl53
Upped the minimum Perl interpreter version from 5.002 to 5.005, just in case someone out there does something like trying to run this script on a non-Unix system...

Revision 1.20  2000/11/28 23:34:56  jcl53
This version only uses footnote module if it sees footnotes.

Added the following item label styles:
	(A)
	(a)

Revision 1.19  2000/11/28 18:28:30  jcl53
Added code to use footnote module in LDMS::DataText.
Fixed a few syntax problems introduced into the state machine.

Revision 1.18  2000/11/28 16:18:47  jcl53
Since we have a table handler now, I removed all the table-detection code from this module.  Speeds it up a bit.

Revision 1.17  2000/11/21 21:01:15  jcl53
Added the following line to all modules to make including modules less painful:

	use lib '..';

If you're using other modules, include them AFTER this line.  Thanks.

Revision 1.16  2000/11/18 20:09:19  jcl53
The DATATEXTHEAD detection has become a bit more sophisticated.  It now checks for two or more consecutive capital letters as well.  At this point, it looks like we're down to the special cases.

TO DO
-----
Figure out if the regularity of the table delimiting lines can be used to our advantage.

Revision 1.15  2000/11/18 14:52:03  jcl53
Added the requirement for whitespace after the last delimiter of a list item label.  Otherwise, we tend to pick up things like "R.S." which are definitely NOT item labels!

Added the following item label styles:
	1a.

Revision 1.14  2000/11/18 14:19:25  jcl53
Recognition patterns are now much more specific, so there are virtually no false positives.  Unfortunately, we're still missing things.

List of...erm...lists that we recognize:
	A.
	a.
	1.
	2000 -

More to be added...

Revision 1.13  2000/11/17 23:19:40  jcl53
Rudimentary DIVDATATEXT handling.  And I do mean "rudimentary".  It does a good job of encapsulating things in the tags and ignoring tables, but tends to hit false positives in terms of looking for lists.

TO DO
-----
Implement a better searching mechanism.  This one's decent, but not good enough.

Revision 1.12  2000/11/17 20:50:51  jcl53
Moved the major DataText processing into the DataText.pm module, in preparation for chaining the tagging process.

Revision 1.11  2000/11/11 18:48:58  jcl53
Another restriction added on DATATEXTNAME.  Excluding preceding whitespace, it must start with a capital letter.

I'm getting the sinking feeling this might require checking to see if the damn thing is centered or not...  Trivial, but possibly unreliable, requiring lots of special cases if their centering is off sometimes...

Revision 1.10  2000/11/11 18:25:16  jcl53
More specificity added.  Now, a DATATEXTNAME is defined by the following:
	a)  Has no lowercase letters.
	b)  Has at least one uppercase letter.

I think this is about as specific as I can make it without missing a DATATEXTNAME somewhere.  It now excludes those pesky table boundaries and lines with one non-alphanumeric (punctuation especially!) on them.

Revision 1.9  2000/11/11 06:19:03  jcl53
DataText.pm now supports the getDataTextLines function, which returns the current set of lines it's working on.  This is quite handy for, say, iterating over a DATATEXT block, no?

Revision 1.8  2000/11/11 01:44:12  jcl53
Reverted the commenting changes.  (The re-prototyping broke the build.)

Comments will be added once the 0.3 build is frozen.

Revision 1.7  2000/11/10 19:22:18  bww3
Commenting additions to the DataText.pm

Revision 1.6  2000/11/10 03:08:42  jcl53
The DataText module includes cleanly.  That makes all six!  Yay!

Now, to use the functions to do something useful...  This is gonna hurt...

Revision 1.5  2000/11/10 01:54:08  jcl53
Corrected a few odd things in the logs, such as the status messages left over from the template (oops).

Revision 1.4  2000/11/10 01:41:56  jcl53
Renamed InputModule.pm to Input.pm just because I thought it looked neat.

In fact, Input.pm has been confirmed to be loadable as a module.  The rest--I'm working on...

Speaking of the rest, the actual code from the separate script files has been C-x i'ed into the corresponding module file.  (For you non-emacs folks out there, that means they've been inserted.)  These AREN'T confirmed to work as modules.  As I said, I'm working on them...

On a final note, the 'my' keyword is our friend, folks.  Use it like Mountain Dew on an all-nighter.

Revision 1.3  2000/11/09 20:55:07  jcl53
Added skeleton Perl module files to the appropriate directory.

Revision 1.2  2000/11/09 20:04:31  jcl53
Potential root module namespace is LDMS::XML.  All other modules will descend from this namespace (LDMS::XML::Foobar, for example).  Note that this is merely a provisional namespace.

Revision 1.1  2000/11/07 20:55:56  jcl53
The DataText module breaks down DataText components into a header (DataTextName) and other data.  Each pair of DataTextName and other data is encapsulated in its own DataText tag.  Only one DataTextName block per DataText tag.
