// bergmark - May, 2000 - Reference Linking Project

/**
 * XHTMLAnalyzer implements the RefLinkAnalyzer interface for XHTML
 * files.  It uses Sun's javax.xml.parsers package to parse the XHTML file.
 * Only one XHTMLAnalyzer is instantiated per Surrogate.
 * The document is input and analyzed using the SAXP method.
 */

// Updates:
// July 28, 2000:  back out DID, buildLocalMetaData should have nothing
//                 to do with BibDatas or Creations.  String, not MIMEfile.
// 2000-07-31: Pick up 2-line titles (<H2> followed by <H3>)
// 2000-08-01: Transition from title to author on either <p> or <center>
// 2000-08-03: Transition from author to contexts on "Abstract" etc.
// 2000-09-05: Drop the sentenceHasAnchor condition.  Not relevant.
// 2000-09-05: Try to handle the case where the reference string's tag
//             contains needed bibliographic data, as in
//             "Bearman, David and Jennifer Trant."
// 2000-09-21: Remove some code to AuthorSection
// 2000-09-28: Convert to Dublin Core and add DOI to local metadata
//             Allow for multiple display ids, multiple DOI's
// 2000-10-23: To enable correct comparison between title in <title>
//             element and title parsed from text strings, substitute
//             ' ' for '\n' and trim() both.
// 2000-11-01: In handleText, check every sentence in a multiple-sentence
//             chunk of text for references.  Fix bad bug in nextEOS
// 2000-11-02: Keep track of tags that signal presence of subtitle
// 2000-11-03: Move ordinal generation from here into ReferenceSection
// 2000-11-06: Add check to EOS to handle sentences ending with ".["
// 2000-11-15: Fix a bug involving grabAuthor that sucked in specious authors
//             Handle reference sections that do not have an introductory
//             tag in front of the first reference by adding firstReference
//             flag that is on at the beginning, but once off stays off
// 2000-12-07: Accommodate section headers in the form "<font size="+1">
//             <strong>.  This was simplified by designing a new class,
//             ContextSection.
// 2001-05-25: Corrected a bug in readInputString, where it would have
//             prematurely run off the end of the file
// 2001-05-30: Be more aggressive about finding a pub date by looking in
//             <meta> elements (containsDate and getDate routines)
// 2001-06-03: AuthorSection has been enhanced to do more of the work
// 2001-06-12: Enhance buildLocalMetaData to use analyzer's pubDate if
//             none was provided by constructor caller
// TBD: move most of the process text stuff out of parser into SentenceTree

// 2001-06-23: Implement buildCitationList()
//            BY: Yu Chen

// 2001-08-01:  Improve concatenation of contexts where there are False Stops
//              Move code to updateContextTrees.  (Maybe move to another
//              class.)

package Linkable.Analysis;

import Linkable.API.*;
import Linkable.Utility.*;

import org.xml.sax.*;          // needs xml-tr2/xml.jar in $CLASSPATH
import org.xml.sax.helpers.ParserFactory; // needs xerces.jar
import javax.xml.parsers.*;    // needs jaxp1.0.1/jaxp.jar in $CLASSPATH
// or the above will work with just xerces-1_1_3/xerces.jar in $CLASSPATH

import java.util.Vector;
import java.util.StringTokenizer;  // for parsing reference tags
import java.util.*;
import java.net.*;
import java.io.*;
import java.lang.reflect.Array;

public class XHTMLAnalyzer extends HandlerBase implements RefLinkAnalyzer {

   // Prefix put in front of every diagnostic line this class prints.
   private static final String ME = "XHTMLAnalyzer: "; // for diagnostics
   // Mirrors the project-wide CONFIG.DEBUG switch; gates the tracing output.
   private static final boolean DEBUG = CONFIG.DEBUG;

   // metadata for the document being analyzed
   private String title = null;         // Set by handleText from <title> and/or heading text
   private String pubDate = null;       // Set this if discovered during parse
   // Display ids for the item; setURL appends the remote URL to this list.
   private Vector displayID=new Vector();
   private Author[] authors = null;     // Authors of this document
   private ReferenceSection rs = null;  // Reference Section of this doc
   private AuthorSection as = null;     // Author Section of this doc
   private ContextSection cs = null;     // Text section of this doc
   private Vector knownCitations = null;  // Fill up this list as we parse
   private Creation me = null;          // Fill up during parse
   private String localURL = "";        // Document to be parsed
   // Set by buildLocalMetaData (synthesized) or by buildRefList (from the BibData).
   private String sourceURN = null;     // Fill in for build CiteRef DB

   private SAXParserFactory factory;    // Gets us a parser to start with

   // NOTE: a Jperl interface to adddoc::AddDocument in CiteSeer
   // would be useful here, if only Jperl worked...

   // Constructor
   /**
    * Default constructor creates a plain XHTMLAnalyzer.  It will not
    * actually perform any analysis until its setURL method is invoked.
    */
   public XHTMLAnalyzer ( ) {
      super();
      // Only the parser factory is prepared here.  The document itself is
      // read later, by setURL, because a HandlerBase constructor has no
      // way to be handed the URL to parse.
      this.factory = SAXParserFactory.newInstance();
      if (DEBUG) {
         System.err.println(ME+"constructed an XHTML parser");
      }
   }

   // setURL -

   /** gets the URL of the Item to be analyzed, and proceeds
    *  to fill up local structures, partially cooked in some cases,
    * the contents of which can be returned on demand by the Surrogate
    * constructor.
    * An alternative to this approach would be to have lots of "set"
    * methods in the Surrogate object which we would set as we analyze.
    * These set methods would have to be public, though, which is not so
    * cool, since they are not part of the Surrogate API as we have defined
    * it.  Anyway, setURL opens a connection and then starts up the parser.
    * @param url is a string of the xhtml item to be analyzed
    * @param remoteURL is the location on the net of the original
    * item
    * @throws SurrogateException if the url cannot be opened for analysis.
    */
   public void setURL ( String url, String remoteURL )
   throws SurrogateException {
      // Records the local and remote locations, opens the local URL and
      // runs the SAX parser over it with this object as the handler.
      // Parse failures are reported on stderr and otherwise swallowed, so
      // whatever was collected before the failure remains available.
      if (DEBUG) {
         System.err.println(ME+"in setURL, " + url + " (local), " +
            remoteURL + " (remote) " );
      }
      localURL = url;
      displayID.add ( remoteURL );

      // Open up URL connection and put a Reader on it
      URLConnection connection = openURL( url );
      InputStreamReader in = openConn ( connection );

      // Try parsing this URL, registering ourselves as the handler
      try {
         SAXParser saxParser = factory.newSAXParser();
         saxParser.parse ( new InputSource((Reader)in), this );
      } catch ( SAXParseException e ) {
         System.err.println (ME+"caught " + e.toString() + " while"
            + " parsing the document and finishing = " + finishing);
         // Don't quit if the error fell in the bottom section
         if ( ! finishing ) {
            System.err.println("Exception info:\n   Line Number: " +
               e.getLineNumber() + "\n   Column Number: " +
               e.getColumnNumber() + "\n   Public ID of entity in error : " +
               e.getPublicId() + "\n   System ID of entity in error: " +
               e.getSystemId() );
            e.printStackTrace();
         }
      } catch ( Exception e ) {
         System.err.println (ME+"caught " + e.toString() + " while"
            + " parsing the document and finishing = " + finishing);
         // Don't quit if the error fell in the bottom section
         if ( ! finishing ) {
            e.printStackTrace();
         }
      } finally {
         // Release the connection's reader whether or not the parse
         // succeeded; previously it leaked whenever parse() threw.
         if ( in != null ) {
            try {
               in.close();
            } catch ( IOException e ) {
               System.err.println (ME+"caught " + e.toString()
                  + " while closing " + url);
            }
         }
      }

      if (DEBUG) {
         System.err.println(ME+"done parsing -- number of contexts is "
            + (contextTrees==null?0:contextTrees.size())
            + ", number of references is " + (rs==null?0:rs.refCount() ) );
      }
   }

   // =============  Over-ridden SAX DocumentHandler methods ==========
   // (DocumentHandler is one of the four SAX interfaces, and HandlerBase
   // extends this interface, so we need the methods here.)

   // HandlerBase also implements the ErrorHandler interface.  It ignores
   // warnings and throws SAXParseException for fatal errors.  We could
   // over-ride some of those methods (fatalError and warning) by registering
   // "this" via Parser.setErrorHandler() method.  Future Reference.

   //               startDocument
   /**         
    * Implements the startDocument interface of the DocumentHandler
    */
   public void startDocument() throws SAXException {
      // Fresh section helpers for every parse.  The author section is given
      // the context section plus the tag names handed to it as the
      // author-start list.
      rs = new ReferenceSection();
      final String[] authorStartTags = new String[] {"p","center","strong","h5"};
      cs = new ContextSection();
      as = new AuthorSection(cs, authorStartTags);
      if (DEBUG) {
         System.err.println(ME+"starting to parse document");
      }
   }

   //               endDocument
   /**         
    * Implements the endDocument interface of the DocumentHandler
    */
   public void endDocument() throws SAXException {
      // Nothing is finalized here; the callback only leaves a trace line.
      if (DEBUG) {
         System.err.println(ME+"done parsing the document");
      }
   }

   //               startElement / endElement
   /**         
    * Implements the startElement interface of the DocumentHandler
    */
   public void startElement ( String name, AttributeList attrs )
   throws SAXException {
      // SAX callback for an opening tag: all of the state-machine work is
      // done by handleStartTag.
      handleStartTag ( name, attrs );
   }

   // SAX callback for a closing tag: forwards the tag name to handleEndTag.
   public void endElement ( String name) throws SAXException {
      handleEndTag ( name );
   }

   //               characters
   /**         
    * Implements the characters interface of the DocumentHandler
    */
   public void characters (char buf[], int offset, int len)
   throws SAXException {
      // SAX callback for character data: buf/offset/len are passed through
      // unchanged to handleText.
      handleText ( buf, offset, len );
   }

   // ----------  private routines to help the SAX parser ---------------

   // Variables having to do with finding sentences which are
   // reference contexts, e.g. sentences with "[10]" in them.
   private Vector contextTrees = new Vector();  // all contexts
   private SentenceTree sentence;            // current sentence.

   // Parse-phase flags.  They are toggled by handleStartTag, handleEndTag
   // and handleText as the document is scanned from top to bottom.
   private boolean starting = true;          // state 1
   private boolean getDocTitle = false;      // state 2 (optional)
   private boolean startTitle = false;       // state 3
   // NOTE(review): moreTitle is only ever cleared in the code visible here;
   // presumably it is set elsewhere in this class (resolveTitles?) -- confirm.
   private boolean moreTitle = false;        // (if state 3 should continue)
   private boolean startAuthor = false;      // state 4
   private boolean grabAuthor = false;       // state 5
   private boolean doContexts = false;       // state 6
   private boolean doingReferences = false;  // state 7
   private boolean grabReference = false;    // state 8
   private boolean finishing = false;        // state 9

   // Set when the reference-section header text is recognized; cleared by
   // the first <p>, <dt> or <li> that follows it.
   private boolean firstReference=false;

   // Name of the tag that opened the title; the matching end tag closes it.
   private String startName = null;          // "h1" or "h2"
   private String moreName = null;          // "h3" or "font"
   // False between a <table> that starts after authors were found and the
   // next </table>.
   private boolean notInTable = true;

   // The <TITLE> element usually contains the title.
   // The first <H1> or <H2> usually has the title, too.
   // In 1998, D-Lib papers had title inside <FONT size="+3"> elements
   // Compare with title found in the head.  If one is a substring of the
   // other, save the longer.
   // Sometimes the title is split across two lines, one an <H1> or <H2> and
   // the other an <H3>.  Look for a colon in the metadata title.
   // Sometimes title starts with <font size="+2"><strong> and the
   // subtitle is contained in <font size="+1">

   // Sometimes <meta> elements contain the date published.  If one
   // is discovered among the <meta> attributes, save it in pubDate.

   // Authors: they usually come in a <p> element after the title.
   // Author names are contained in <p>, <center>, or <strong> elements.
   // The Author section is terminated by an <H-> header.
   // Unless the authors are delimited by <h5> tags and the body's
   // section headers are delimited by <P><B> tags.

   // About tables.  Some papers (e.g. DLIB) format the entire paper
   // within a table.  Other papers (e.g. JEP) have authors, then a table
   // of contents, then the body of the paper.  So the rule of where can
   // Section Headers appear is this: if the table begins after we have
   // at least one author, a section header cannot be in there (notInTable
   // is false).

   /**
    * Advances the parse state machine on an opening tag.
    * Depending on the current phase the tag may start the document title,
    * supply a publication date (from a meta element), start a subtitle,
    * be handed to the author section, be checked as a section header by
    * the context section, or delimit an entry in the reference section.
    * @param name  the element name as delivered by the SAX parser
    * @param attrs the element's attributes as delivered by the SAX parser
    */
   protected void handleStartTag ( String name, AttributeList attrs ) {
      if (DEBUG) System.err.println ( ME + " got start tag " + name );
      // A table that begins once authors are known cannot hold section headers.
      if ( name.equals("table") && as.gotAuthors() ) {
	 if ( DEBUG ) System.err.println ( ME + " now in a table");
	 notInTable = false; as.setTable (false);
      }
      // <title> in the head: handleText will capture its text.
      if ( starting == true && name.equals("title") ) {
	 getDocTitle = true;
	 if (DEBUG) System.err.println(ME+"getDocTitle goes to true");
      }
      // First <h1>, <h2> or size-changing <font>: treat it as the visible title.
      else if ( starting == true && (name.equals("h2") ||name.equals("h1")
	 || (name.equals("font") && isSizeChange(attrs)) )  ) {
         startTitle = true; starting = false;
         if (DEBUG) System.err.println(ME+"Title? " );
	 startName = name;
      }
      // <meta name=... content=...> seen before the title: try to get a date.
      else if ( starting == true && pubDate == null && name.equals("meta")
	 && attrs != null && attrs.getLength() != 0 ) {
	    String string1 = attrs.getValue("name");
	    String string2 = attrs.getValue("content");
	    if ( string1 != null && string2 != null ) {
	      if ( string1.equals("date") )
		 pubDate = MetaData.toOAMS(string2);
	      else pubDate = MetaData.containsDate(string2);
	    if (DEBUG && pubDate != null)
	       System.err.println(ME+"pubDate set to " + pubDate);
	    }
      }
      // Second title line: remember which tag must close it.
      else if ( startTitle && moreTitle &&
	 ( name.equals("h3") || (name.equals("font") && isSizeChange(attrs)) ) ) {
         if (DEBUG) System.err.println(ME+"Subtitle?  moreName = " + name );
	 moreName = name;
      }
      // Author phase: the author section decides whether the phase continues.
      else if ( startAuthor ) {
	 startAuthor=as.handleStartTag(name,attrs);
	 if ( !startAuthor ) {
	    doContexts = true;
            if (DEBUG) System.err.println(ME
	       +"startAuthor set false, doContexts set true");
	    cs.checkForHeader ( name, attrs );  // handle possible section
	 }
      }
      // NOTE(review): this branch can never be taken -- it is only reached
      // when the preceding "else if ( startAuthor )" test failed, so
      // startAuthor is false here.  Intended ordering should be confirmed.
      else if ( startAuthor && as.gotAuthors() && notInTable
		&& cs.handleStartTag(name, attrs) ) {
         startAuthor = false;
         doContexts = true;
         if (DEBUG) System.err.println(ME
	    +"startAuthor set false, doContexts set true");
      }
      else if ( doContexts ) {
	 cs.checkForHeader ( name, attrs );  // handle possible section
      }

      // Most references are delimited by <p> tags.  Sometimes the first
      // reference is not enclosed in <p>
      // Some by <dt>tag</dt><dd>reference string</dd>
      // Some by <ol><li>...</li>...</ol>  "rs" will generate "1." ...
      else if ( doingReferences ) {
	   // Any header tag after the references ends the reference section.
	   if ( isAnH ( name ) ) {
	      doingReferences = false;
	      finishing = true;
	   } else {
	      if (name.equals("p") || name.equals("dt") || name.equals("li")) {
	         if (DEBUG)
	         System.err.println(ME+"scanning for references, got a <"
		 + name +">");
		 // Close off text gathered for an untagged first reference.
		 if ( firstReference ) { rs.endText(); firstReference=false; }
	         grabReference=true;
              }
              else if ( name.equals("ol") ) rs.generateOrdinals();
	   }
      } // doingReferences

   } // handleStartTag

   /**
    * Advances the parse state machine on a closing tag: ends the header
    * title, ends the visible title (and subtitle) and moves on to the
    * author phase, forwards the tag to the author or context section, or
    * terminates the reference string currently being collected.
    * @param tag the element name as delivered by the SAX parser
    */
   protected void handleEndTag ( String tag ) {
      if (DEBUG) System.err.println ( ME + " got end tag " + tag );
      if ( tag.equals("table") ) { notInTable = true; as.setTable (true); }
      if (getDocTitle && tag.equals("title") ) {
	 getDocTitle = false;
	 if (DEBUG) System.err.println(ME+"getDocTitle goes to false");
      }
      // End of the tag that opened the title.  Unless a subtitle is still
      // expected (moreTitle), switch to looking for authors.
      else if ( startTitle && tag.equals(startName) ) {
         if (DEBUG) System.err.println(ME+" end of " + startName );
	 startName = null;
	 if (! moreTitle ) {
	 startTitle = false;
         startAuthor = true;    // start looking for author(s)
	 //grabAuthor = true;     // also assume next line is an author
	 if (DEBUG)
	 System.err.println(ME+"startTitle goes false, startAuthor true");
	 // assert: title is some combination of what was in
	 // <title></title> and <H2></H2> or maybe <H2></H2>...<H3></H3>
	 // or maybe <H1></H1>...<H3></H3> or maybe <FONT size="+3></FONT>
	 }
      }
      // End of the subtitle tag: the title is complete, look for authors.
      else if ( startTitle && moreTitle && tag.equals(moreName) ) {
         moreName = null;
	 moreTitle = false;
	 startTitle = false;
	 startAuthor = true;
	 //grabAuthor = true;
	 // (grabAuthor is not actually set here; only the message mentions it.)
	 if (DEBUG) System.err.println(ME
	    + "startTitle goes false, startAuthor and grabAuthor to true");
      }
      else if ( startAuthor ) as.handleEndTag( tag );
      else if ( doingReferences &&
	    (tag.equals ("p") || tag.equals ("li") || tag.equals("dd")) ) {
	    grabReference = false; // </P> ends a reference
	    rs.endText();
      }
      // Allow Context Section to stop scanning for multiple-tag section hdrs.
      // NOTE(review): startAuthor is always false when this test is reached
      // (the "else if ( startAuthor )" above would have been taken), so only
      // doContexts matters here.  Confirm whether the context section was
      // meant to see end tags during the author phase as well.
      else if ( doContexts || startAuthor ) cs.handleEndTag ( tag );

   }   // handleEndTag


   // handleText  -  Captures text fragments
   /**
    * Routes one run of character data to whichever phase of the parse is
    * active: header title, visible title, author section, reference
    * string, or body text that is split into sentences and scanned for
    * reference contexts.
    * Only text[offset .. offset+length) belongs to this event; the rest of
    * the array is the parser's own buffer and must not be inspected.
    * @param text   buffer holding the characters
    * @param offset index of the first character of this run
    * @param length number of characters in this run
    */
   protected void handleText ( char[]  text, int offset, int length ) {
      if ( startAuthor ) cs.handleText();

      // Skip runs that are a lone newline or nothing but whitespace.
      // (These checks used to look at text[0] and text.length, i.e. at the
      // whole parser buffer instead of the run being delivered.)
      if ( length == 1 && text[offset] == '\n' ) return;
      final int end = offset + length;
      int i = offset;
      while ( i < end && Character.isWhitespace(text[i]) ) i++;
      if ( i == end ) {
         if (DEBUG) System.err.println(ME+"empty text");
         return;
      }

      String textString = new String ( text, offset, length );
      // Newlines become blanks and the ends are trimmed so that titles
      // from different sources can be compared.
      String flattened = textString.replace('\n',' ').trim();
      if ( flattened.equals("") ) return;

      if ( getDocTitle ) {
         title = flattened;
         if (DEBUG)
            System.err.println(ME+"title in header: "+title);
      }
      if ( startTitle ) {
         title = resolveTitles ( title, flattened );
         if (DEBUG)
            System.err.println(ME+"title resolved to: " + title
               + " (moreTitle is " + moreTitle + ")" );
      }

      // Assume each author is in a separate paragraph <p>author</p>
      // or possibly centered as in <center>author</br> or
      // <center> author [and institution] </center>
      // or multiple authors are in a single comma-separated line.
      // The author section reports false once the text no longer looks
      // like author data (e.g. Abstract or Introduction was reached).
      else if ( startAuthor ) {
         if ( ! as.handleText( text, offset, length, textString ) ) {
            if (DEBUG) System.err.println(ME+
                  "grabAuthor, startAuthor set false, doContexts set true");
            startAuthor = false; doContexts = true; return;
         }
      }  // if startAuthor

      // While processing references assume each reference is delimited
      // by <P> and </P>.  Ignore tags of all sorts, but keep any URL
      // links that are found.
      else if ( grabReference || firstReference ) rs.addText(textString);

      // Look for a reference section only at headers, i.e. font changes
      else if (doContexts) {
         if ( cs.isReference ( textString.trim() ) ) {
            doingReferences = true; firstReference = true;
            /* grabReference = true; bad, no </P> to turn it off, */
            /* leaving entire appendices being gobbled up as references */
            doContexts = false;
            if (DEBUG) {
               System.err.println(ME+"doContexts set to false");
               System.err.println(ME+"doingReferences set to true");
            }
         }

         // just a paragraph of text.  Accumulate one sentence at a time.
         if (DEBUG) System.err.println(ME+"processing text");

         // process initial sentence (fragment) in textString
         int nextPos = 0;
         while ( nextPos != -1 ) {
            nextPos = processText (textString, nextPos );
            // nextPos = -1 if we ran off end of textString looking for EOS
            if (nextPos != -1 ) {  // we have (we think) a sentence
               // The following updates contextTrees as a side effect
               if ( sentence.updateContextTrees(contextTrees) ) {
                  sentence = new SentenceTree ( sentence );
               } else sentence.reset();
               if (DEBUG) System.err.println (ME + "contextTrees has "
                  + contextTrees.size() + " elements.");
            }
         }

      } // if doContexts

   } // handleText


   // processText - called only if doContexts is true.
   /** Splits text into sentences.
    * @param s the chunk of text that is being processed.
    * @param nextPos (0-based) is where to pick
    * up looking for the end of the current sentence.
    * @return the updated nextPos value.  Returns -1 if we are done with s.
    * Returns nextPos == s.length() if s ends with a sentence.
    * The updated nextPos value should be the position of the first
    * character of the next sentence in this textString.
    * Side effect is that the current sentence has been updated.
    */
   private int processText ( String s, int nextPos ) {
      if (DEBUG) {
         System.err.println(ME+"in processText, nextPos = " + nextPos
            + ", textString is " + s.length() + " characters long");
      }

      // Where the following sentence begins: -1 when no sentence end was
      // found, s.length() when s finishes exactly on a sentence end.
      final int ind = nextEOS ( s, nextPos );
      if (DEBUG) {
         System.err.println(ME+"in processText, nextEOS = " + ind);
      }

      // The very first fragment needs a tree to be added to.
      if ( sentence == null ) {
         sentence = new SentenceTree();
      }

      // The fragment runs to the sentence boundary, or to the end of s
      // when there is no boundary.
      final int fragmentEnd = ( ind == -1 ) ? s.length() : ind;
      sentence.addNode ( s.substring(nextPos, fragmentEnd) );

      if (DEBUG) {
         System.err.println(ME+"in processText, returning " + ind);
      }
      return ind;
   } // processText

   // nextEOS -
   /** finds the end of the sentence.
    * @param String s is the hunk of text currently being scanned.
    * @param integer n, position at which to start scanning.
    * Starting at String s, position n, return where the next sentence
    * starts, or -1 if you run off the end of the string while looking
    * for end of sentence, or s.length() if s ends with a sentence.
    */
   // TBD: See if we cannot deal strictly with trimmed and de-\n'ed strings
   private int nextEOS ( String s, int n ) {
      // Ask fullStop for the nearest sentence end of each kind and keep
      // the earliest one that was actually found.  The old code compared
      // against -1 as if it were a position, so with no '.' in the text a
      // '?' or '!' sentence end was never reported, and '!' was not even
      // tried when a '?' came before the '.'.
      final char[] stops = { '.', '?', '!' };
      int earliest = -1;
      for ( int k = 0; k < stops.length; k++ ) {
         int pos = fullStop ( s, n, stops[k] );
         if ( pos != -1 && ( earliest == -1 || pos < earliest ) ) {
            earliest = pos;
         }
      }
      return earliest;
   }  // nextEOS

   // Look for stop-character followed by either EOL or at least one
   // space, followed by EOL or capital letter.  Keep scanning until
   // one such is found, or else return -1.  If full stop character
   // is followed immediately by a '[' or a '(' assume it is end of
   // sentence even though there was no blank
   private int fullStop ( String s, int n, char c) {
      // Returns the index at which the next sentence starts, s.length()
      // if the sentence ends s (possibly followed by blanks), or -1 if no
      // acceptable stop character is found at or after position n.
      final int len = s.length();
      int stop = s.indexOf((int)c, n);
      while ( stop != -1 ) {
         if ( stop == len-1 ) return len;       // stop char is the last char
         int next = stop + 1;
         char ch = s.charAt(next);
         // ".[" and ".(" count as a sentence end even without a blank
         if ( ch == '[' || ch == '(' ) return next;
         // Skip all whitespace after the stop character.  The old loop
         // quit one character early, so text ending in two or more blanks,
         // or in a one-character sentence, yielded the index of a blank.
         while ( next < len && Character.isWhitespace(s.charAt(next)) ) next++;
         if ( next == len ) return len;         // sentence trailed with blanks
         // anything but a lower-case letter starts a new sentence
         if ( ! Character.isLowerCase(s.charAt(next)) ) return next;
         // otherwise, look further.
         stop = s.indexOf((int)c, next+1);
      } // while looking for ".  C"
      return -1;
   }                     // fullstop

   // ==================================================
   // Required methods for the RefLinkAnalyzer interface
   // ==================================================

   // buildLocalMetaData -
   /** Return an XML file that contains original text fragments of
     * bibliographic information gleaned from this archive item.
     * As a side effect, since we now have the needed information and
     * since the XML file needs it anyway, synthesize a URN for this item.
     * Stick in own pubDate if one is not provided.
     */

   public String buildLocalMetaData( String doi, String pubDateIn, Creation c ) {
      // Builds the Dublin Core record for this item.  On the way it fixes
      // the item's URN (synthesizing one if none is set yet), tries to
      // merge that URN with incomplete ("*") entries already held in the
      // creation database, copies URN/title/authors into c, and saves the
      // URN/DOI pair.  c must not be null.
      if (DEBUG)
         System.err.println(ME+"in buildLocalMetaData");

      // set up the Author[] array
      authors = as.getAuthors();

      // Use caller's pubDate if one is defined
      if ( pubDateIn != null && !pubDateIn.equals("") ) pubDate = pubDateIn;

      // Last name of the first author, or null if there is none.  Guarded
      // so that a missing or empty author list no longer raises
      // NullPointerException / ArrayIndexOutOfBoundsException below.
      String firstLastName = null;
      if ( authors != null && authors.length > 0 && authors[0] != null )
         firstLastName = authors[0].getLastName();
      int authorLength =
         ( firstLastName == null ) ? 0 : Math.min(10, firstLastName.length());

      // Synthesize a URN for this item
      if ( sourceURN == null )
         sourceURN = Creation.synthesizeURN ( firstLastName, pubDate, title );
      sourceURN = sourceURN.trim();

      // Check the creation database to see whether this URN can be merged
      // with one that is already stored.
      if(sourceURN.indexOf('*') != -1) {
         // there is "*" in sourceURN; authorLength is 0 when the author
         // part itself is "*"
         String mergableCreation =
            CreationDatabase.mergeCreation(sourceURN, authorLength);
         if(mergableCreation != null) {
            // merge the two urns
            sourceURN = mergableCreation;
         }
      }
      else if ( firstLastName != null
                && sourceURN.length() >= authorLength + 4 ) {
         // There is no "*" in sourceURN.  The substring arithmetic below
         // takes the first authorLength characters as the author part and
         // the next four as the date part, so it is only attempted when an
         // author is known and the URN is long enough.
         Creation noAuthors, noDate, noTitle;
         String oldUrn;
         if((noAuthors = CreationDatabase.fetchCreation(
               "*" + sourceURN.substring(authorLength))) != null) {
            // a stored creation lacking authors can merge with sourceURN
            noAuthors.setAuthors(authors);
            oldUrn = noAuthors.getURN();
            noAuthors.setUrn(sourceURN);
            CreationDatabase.update(sourceURN, oldUrn);
         }
         else if((noDate = CreationDatabase.fetchCreation(
               sourceURN.substring(0, authorLength) + "*"
               + sourceURN.substring(authorLength+4))) != null) {
            // a stored creation lacking the date can merge with sourceURN
            noDate.setDate(pubDate);
            oldUrn = noDate.getURN();
            noDate.setUrn(sourceURN);
            CreationDatabase.update(sourceURN, oldUrn);
         }
         else if((noTitle = CreationDatabase.fetchCreation(
               sourceURN.substring(0, authorLength+4) + "*")) != null) {
            // a stored creation lacking the title can merge with sourceURN
            noTitle.setTitle(title);
            oldUrn = noTitle.getURN();
            noTitle.setUrn(sourceURN);
            CreationDatabase.update(sourceURN, oldUrn);
         }
      }
      else if (DEBUG) {
         System.err.println(ME+"skipping URN merge lookup for " + sourceURN);
      }

      c.setUrn(sourceURN);
      c.setTitle(title);
      c.setAuthors(authors);
      CreationDatabase.save(sourceURN, doi);

      return GenXML.dc( title, pubDate, sourceURN, displayID, doi, authors, "");
   }

   /** buildRefList -
    * Return an array of Reference objects gleaned from this archive item.
    * As a side effect, also update the CiteRef database
    * No array element should be null, because it should have at least
    * the reference string.
    */

   public Reference[] buildRefList( BibData b ) {
      if (DEBUG) {
         System.err.println(ME+"in buildRefList");
      }

      // Fall back on the caller's URN when none has been fixed yet.
      if ( sourceURN == null ) {
         sourceURN = b.getURN();
      }

      // Normal case: the reference section turns what it collected, plus
      // the context sentences, into the Reference array.
      if ( rs != null ) {
         return rs.buildRefList( sourceURN, contextTrees );
      }

      // The document was never parsed far enough to create a section.
      if (DEBUG) {
         System.err.println(ME+" no reference section found while " +
            "parsing this document.  Returning null Reference list");
      }
      return null;
   }

   /** buildCitationList -
   * Return a vector of Citation objects currently known for this item
   * This will involve calls on the citeref database, which is
   * indexed by document URN.
   */
   public Vector buildCitationList( String docURN ) {
      if(DEBUG)
      System.err.println(ME+"in buildCitationList");

      // when building a Surrogate for a new work, need to check from the
      // citeRefDatabase to get all the citors that has cited this work already
      // and fill in the knownCitation field
      int authorLength = Math.min(10, authors[0].getLastName().length());
      if(CiteRefDatabase.findCiters("*" + sourceURN.substring(authorLength)) != null) {
          // there are incomplete-urn citers in CiteRefDatabase, missing authors
          CiteRefDatabase.update(sourceURN.trim(), "*" + sourceURN.substring(authorLength));
      }

      if(CiteRefDatabase.findCiters(
              sourceURN.substring(0, authorLength) + "*" + sourceURN.substring(authorLength+4)) != null) {
          // there are incomplete-urn citers in CiteRefDatabase, missing dates
          CiteRefDatabase.update(sourceURN.trim(),
              sourceURN.substring(0, authorLength) + "*" + sourceURN.substring(authorLength+4));
      }

      if(CiteRefDatabase.findCiters(sourceURN.substring(0, authorLength+4) + "*") != null) {
          // there are incomplete-urn citers in CiteRefDatabase, missing title
          CiteRefDatabase.update(sourceURN.trim(), sourceURN.substring(0, authorLength+4) + "*");
      }

      Vector vCiters = CiteRefDatabase.findCiters(sourceURN.trim());

      // sourceURN already exists, means somebody has cited sourceURN
      if (vCiters != null) {
         if (this.knownCitations == null) {
            this.knownCitations = new Vector();
            if(DEBUG)
              System.err.println("Find Citers!!!: " + vCiters.toString());
         }

         // for each citer, reconstruct its surrogate, double check
         // if sourceURN is in the reference list, then add this citer's citation
         // to the sourceURN's knownCitations vector and return the citation vector

         for (int i = 0; i < vCiters.size(); i++) {
            String[] citor = (String[])vCiters.elementAt(i);
            String urn = citor[0];
            String index = citor[1];
            Creation r = CreationDatabase.fetchCreation(urn);

           // String docID = CreationDatabase.getDOI(sourceURN);
           String docID = CreationDatabase.getDOI(urn);
            if(DEBUG)
              System.err.println("urn and doi" + urn + " " + docID);
            Surrogate sCiters = null;
            if ( docID != null) {
                String sDir = CONFIG.REPOSITORY + "/" + docID + "/Surrogate";
                sCiters = new Surrogate(sDir, urn, docID);
            }
            else {
		if (DEBUG)
                System.err.println("The WORK-" + urn + " does not exist!");
            }

            // double check if the sourceURN is in the reference list of this citer
            if (sCiters != null ) {
                Reference[] refList = sCiters.getReferences();
                for (int j = 0; j < Array.getLength(refList); j++) {
                    Creation c = refList[j].getDocID();

                    String cURN = c.getURN().trim();

                    if (cURN.compareToIgnoreCase(sourceURN.trim())==0) {
                        this.knownCitations.addElement(Surrogate.buildCitation(refList[j]));
                        if(DEBUG)
                          System.err.println("sourceURN is in the reference list");
                    }
                    else if(cURN.equals(("*" + sourceURN.substring(authorLength)).trim())){
                      // incomplete urn in /surrogate/refList file, need to rebuild
                      // the refList and overwrite the file with merged urn
                      // missing authors
                      refList[j].getDocID().setAuthors(authors);
                      refList[j].getDocID().setUrn(sourceURN.trim());
                      String result="<?xml version=\"1.0\"?>\n"
	                  + "<referencelist xmlns:dc=\"http://purl.org/DC\" length=\"";
                      if ( refList != null ) {
	                  result=result+refList.length+"\">\n";
                          for (int k=0; k < refList.length; k++)
	                    result = result + refList[k].toString() + "\n";
                      }
                      else result = result + "0\">\n";
                      result = result + "</referencelist>\n";
                      String filename = CONFIG.REPOSITORY + "/" + docID + "/Surrogate" + "/refList";
                      sCiters.saveData(filename, result.getBytes());
                      this.knownCitations.addElement(Surrogate.buildCitation(refList[j]));
                    }
                    else if(cURN.equals((sourceURN.substring(0, authorLength) + "*" + sourceURN.substring(authorLength+4)).trim())){
                      // missing dates
                      refList[j].getDocID().setDate(pubDate);
                      refList[j].getDocID().setUrn(sourceURN.trim());
                      String result="<?xml version=\"1.0\"?>\n"
	                  + "<referencelist xmlns:dc=\"http://purl.org/DC\" length=\"";
                      if ( refList != null ) {
	                  result=result+refList.length+"\">\n";
                          for (int k=0; k < refList.length; k++)
	                    result = result + refList[k].toString() + "\n";
                      }
                      else result = result + "0\">\n";
                      result = result + "</referencelist>\n";
                      String filename = CONFIG.REPOSITORY + "/" + docID + "/Surrogate" + "/refList";
                      sCiters.saveData(filename, result.getBytes());
                      this.knownCitations.addElement(Surrogate.buildCitation(refList[j]));
                    }
                    else if(cURN.equals((sourceURN.substring(0, authorLength+4) + "*").trim())) {
                      // missing title
                      refList[j].getDocID().setTitle(title);
                      refList[j].getDocID().setUrn(sourceURN.trim());
                      String result="<?xml version=\"1.0\"?>\n"
	                  + "<referencelist xmlns:dc=\"http://purl.org/DC\" length=\"";
                      if ( refList != null ) {
	                  result=result+refList.length+"\">\n";
                          for (int k=0; k < refList.length; k++)
	                    result = result + refList[k].toString() + "\n";
                      }
                      else result = result + "0\">\n";
                      result = result + "</referencelist>\n";
                      String filename = CONFIG.REPOSITORY + "/" + docID + "/Surrogate" + "/refList";
                      sCiters.saveData(filename, result.getBytes());
                      this.knownCitations.addElement(Surrogate.buildCitation(refList[j]));
                    }
                }
            }
            else {
                System.err.println(ME + "can not find the citer's surrogate: " + sourceURN);
            }
         }
         return this.knownCitations;
      }
      else {
        return null;
      }
   }

   // These entry points support getLinkedText

   /**
    * getLinkedTextInitialize sets up to generate XML for our Surrogate,
    * but not the <? xml version="1.0" ?> incantation.
    * @return String to be added to the XML (including new lines)
    */
   public String getLinkedTextInitialize() {
      // This analyzer contributes no text at this stage: the result is
      // always the empty string, and the call is only traced when DEBUG
      // is on.
      final String prologue = "";
      if ( DEBUG ) {
         System.err.println (ME+" in getLinkedTextInitialize");
      }
      return prologue;
   }

   /**
    * getLinkedText emits XML for the linked body of the text.
    * @param refList The array of Reference objects belonging to this Surrogate.
    * @param url URL of the item being analyzed, for Base URL address
    * @throws SurrogateException if URL to be analyzed cannot be opened.
    * @return String to be added to the XML (including new lines)
    */
   public String getLinkedText ( Reference[] refList, String url )
   throws SurrogateException {

      // NOTE: upon entry we already have a connection established to
      // localURL, and calling connect() is redundant; however this
      // connection is not in a state such that it can be used.
      // Besides, this routine might be called by a resurrected Surrogate,
      // and no analysis has been done by this analyzer

      // Re-read the document from localURL and let RefLinker produce the
      // linked text from it, the reference list and the base URL.
      return RefLinker.getLinkedText( readInputStream ( localURL ),
                                      refList, url );
   }

   /**
    * getLinkedTextFinalize emits XML for finishing off the Surrogate
    * linked text output.  The main
    * use for this routine is to emit the linkage data elements for
    * documents that are not expressed in HTML or in XHTML.
    * @return String to be added to the XML (including new lines)
    */
   public String getLinkedTextFinalize () {
      // This analyzer appends nothing after the linked text: the result
      // is always the empty string, and the call is only traced when
      // DEBUG is on.
      final String epilogue = "";
      if ( DEBUG ) {
         System.err.println(ME+"in getLinkedTextFinalize");
      }
      return epilogue;
   }


// ==============  PRIVATE METHODS ============================

   // Connect to this url for analysis of an HTML document
   //private HttpURLConnection openURL ( String url ) {
   private URLConnection openURL ( String url )
   throws SurrogateException {
      // Build the URL and obtain a connection object for it.  Any
      // IOException raised on the way (a malformed URL is one) is
      // reported to the caller as a SurrogateException.
      final URLConnection connection;
      try {
         connection = new URL(url).openConnection();
      } catch ( IOException e ) {
         throw new SurrogateException
         (ME+" cannot open URL " + url + "!");
      }
      if (DEBUG) {
         System.err.println(ME+" connection to URL " + url );
      }
      return connection;
   }

   // getDate - returns pubDate (possibly null)
   // is called by Surrogate in package Linkable.API
   public String getDate() {
      // Hands back the pubDate field as is; it may be null.
      return pubDate;
   }

   // Open this URL Connection - public because could be called by Surrogate
   public InputStreamReader openConn ( URLConnection conn )
   throws SurrogateException {
      // Ask the connection for its input stream; an IOException here is
      // reported to the caller as a SurrogateException naming the URL.
      final InputStream stream;
      try {
         stream = conn.getInputStream();
      } catch ( IOException e ) {
         throw new SurrogateException (
         "The xhtml parser cannot open the url, " +
         (conn.getURL()).toString() );
      }
      if ( DEBUG ) {
         System.err.println(ME+" got input stream ");
      }
      // Wrap the byte stream in a reader (default character encoding).
      return new InputStreamReader ( stream );
   }

   // Create a buffered reader
   private BufferedReader buffer ( InputStreamReader in, int k ) {
      // Try for a reader whose buffer holds k characters.  When the
      // constructor rejects that size with an IllegalArgumentException,
      // report it and fall back to the default buffer size.
      BufferedReader reader;
      try {
         reader = new BufferedReader ( in, k );
      } catch ( IllegalArgumentException e ) {
         System.err.println(ME+"cannot open a buffered reader with "
         + " a buffer of size " + k);
         reader = new BufferedReader ( in );
      }
      return reader;
   }

   // Read the HTML (blocking)
   /**
    * Reads the whole document at the given URL into a String, blocking
    * until the end of the stream is reached.
    *
    * The reported content length is used only as a sizing hint: it is -1
    * when the length is unknown and it counts bytes rather than
    * characters, so the result is built from what is actually read.
    *
    * @param u URL of the document to read
    * @return the characters read; if an error interrupts the read, the
    *         part read so far (the error is reported on System.err)
    * @throws SurrogateException if the URL or its input stream cannot be
    *         opened
    */
   private String readInputStream ( String u ) throws SurrogateException {

      // open the URL connection and put a buffered reader on it
      URLConnection connection = openURL( u );
      InputStreamReader in = openConn ( connection );
      int k = connection.getContentLength();
      BufferedReader br =
         ( k > 0 ) ? buffer ( in, k ) : new BufferedReader ( in );

      // accumulate the document chunk by chunk until end of stream
      StringBuffer document = new StringBuffer ( k > 0 ? k : 8192 );
      char[] chunk = new char[4096];
      try {
         int n;
         while ( (n = br.read ( chunk, 0, chunk.length )) != -1 )
            document.append ( chunk, 0, n );
      } catch (IOException e) {
         System.err.println(ME+"get IOException while trying to read in"
         + " from url " + u + "\n" + e.toString() );
      } catch (Exception e) {
         System.err.println(ME+"caught exception while trying to read in"
         + " from url " + u + "\n" + e.toString() );
      } finally {
         // always release the connection's stream, even after a failed read
         try {
            br.close();
         } catch (IOException ignored) {
            // the content has already been collected; a failure to close
            // leaves nothing further to recover
         }
      }

      return document.toString();
   }

   /*
   // mapReferences -
   */
   /** returns XLink elements or null for each Reference in the list
   * note that XLink elements may contain multiple URLs
   * They each contain "****" where the anchor (the reference in text)
   * is supposed to go.
   * @param The array of References to be process
   * @returns link elements for each
   */
   /*
   private String[] mapReferences ( Reference[] refList ) {
      String[] result = new String[refList.length];
      for ( int i=0; i<refList.length; i++ ) {
	 Creation docID = refList[i].getDocID();
         String[] urlList = docID.getURLs();
         // urlList is null if item has no display ID
         if ( urlList == null ) result[i] = null;
         else {
            String anchor = "****";   // just a placeholder
            result[i] = GenXML.urlList2xlink ( i,
	    refList[i].getLiteral(),
	    (docID.getAuthors()[0]).getLastName(),
	    docID.getTitle(), docID.getDate(),urlList, anchor );
         }
      }
      return result;
   }
   */


   /*
   // return the tag elements for each Reference in the list.
   // Tags are stripped of [](). characters.
   private String[] getTags ( Reference[] refList ) {
      String[] result = new String[refList.length];
      for ( int i=0; i<refList.length; i++ )
         result[i] = refList[i].getTag();
      return result;
   }
   */

   // Find out which reference this is, and if it has a URL return the xLink
   // with the actual ref substituted for the "****"
   /**
    * Looks up ref among the reference tags and returns the xLink stored
    * at the same index, with ref substituted for the "****" placeholder.
    *
    * @param xLinks xLink for each reference (null when it has none),
    *        indexed like tags
    * @param tags reference list tags
    * @param ref the reference text to look up
    * @return the xLink with ref in place of "****"; the xLink unchanged
    *         when it carries no placeholder; "" when ref matches no tag
    *         or the first matching tag has no xLink
    */
   private String refHasURL(String[] xLinks, String[] tags, String ref) {
      if ( DEBUG )
         System.err.println(ME+"in refHasURL for ref ->" + ref);
      for (int i=0; i<tags.length; i++)
         if ( ref.equals ( tags[i] ) ) {
            String xlink = xLinks[i];
            if ( DEBUG )
               System.err.println(ME+"in refHasURL got xlink ->" + xlink );
            if ( xlink == null ) return "";   // this is the ref, but no xlink
            int j = xlink.indexOf("****");
            // Without a placeholder there is nothing to substitute; hand
            // the link back untouched instead of failing in substring().
            if ( j == -1 ) return xlink;
            return xlink.substring(0,j)
                + ref + xlink.substring(j+4);
         }
      return "";
   }

   // document: current version of the document being analyzed
   // position: where in the document the current context is
   // tags: reference list tags for this document
   // xLinks: xlinks for each reference for this document
   // sentence: the current context
   /**
    * Replaces, in the document, each reference of the current context
    * that has an xLink by that xLink.  The context's tag string is split
    * on "[]().-," (delimiters are returned as tokens); a reference is the
    * token that follows "[", "(" or ",".
    *
    * @param document current version of the document being analyzed
    * @param position index in document from which this context's tokens
    *        are searched for
    * @param tags reference list tags for this document
    * @param xLinks xlinks for each reference, indexed like tags
    * @param sentence the current context; its getTags() string is scanned
    * @return the document with the substitutions applied
    */
   private String update ( String document, int position, String[] tags,
      String[] xLinks, SentenceTree sentence ) {
      StringTokenizer st =
         new StringTokenizer(sentence.getTags(),"[]().-,", true);

      // Find each ref in this context
      while ( st.hasMoreTokens() ) {
         String token = (st.nextToken()).trim();
         if ( DEBUG ) System.err.println(ME+"next token ->"+token);
         if ( !( token.equals("[") || token.equals("(")
                 || token.equals(",") ) )
            continue;

         // an opener at the very end of the tag string introduces nothing
         if ( !st.hasMoreTokens() ) break;

         // update position in document - get index of the opener; keep
         // the old position when the opener is not found so a failed
         // search cannot reset or corrupt it
         int openerAt = document.indexOf(token,position);
         if ( openerAt != -1 ) position = openerAt;

         // get the next token string in this context
         String ref = (st.nextToken()).trim();
         if ( DEBUG ) System.err.println(ME+"ref->"+ref);
         int refAt = document.indexOf (ref, position);
         if ( refAt != -1 ) position = refAt;

         String terminator = st.hasMoreTokens() ? st.nextToken() : "";
         if ( DEBUG ) System.err.println(ME+"terminator->"+terminator);

         // skip references of the form [number-number].  Future
         // implementation TBD.
         if ( terminator.equals("-")) {
            // drop the second ref and the closing ']' when present
            for ( int n = 0; n < 2 && st.hasMoreTokens(); n++ )
               st.nextToken();
            continue;
         }

         String xlink = refHasURL ( xLinks, tags, ref );
         if ( DEBUG )
            System.err.println(ME+"back from refHasURL with xlink ->"+xlink);

         // Substitute only when the ref was located in the document and
         // truly matches a reference tag that has an xLink.
         if ( refAt != -1 && ! xlink.equals( "" ) ) {
            // span of the whole reference text, so that a reference of
            // more than one character is replaced in full
            int[] k = findLocalLink(document,position,ref);
            document = document.substring(0,k[0]) + xlink +
               document.substring(k[1]+1);
            // resume after the inserted link so that later searches do
            // not match punctuation inside the link itself
            position = k[0] + xlink.length();
         }
      } // end of tokens

      return document;
   }

   // We have an anchor at position "p" in document "doc"
   // If this is an anchor for a local link, return indices of
   // beginning and end of local link.  Otherwise just return
   // position of the reference "r"
   // (NOTE: this still needs LOTS of work)
   private int[] findLocalLink (String doc, int p, String ref) {
      // No local-link detection is done here: the span returned is simply
      // the ref.length() characters starting at p, as {first, last} with
      // both indices inclusive.  The doc argument is not consulted.
      final int first = p;
      final int last = p + ref.length() - 1;
      return new int[] { first, last };
   }

   private boolean isAnH ( String h ) {
      // True exactly for the lower-case names "h1" through "h6": a
      // two-character string made of 'h' and one digit in 1..6.
      if ( h.length() != 2 || h.charAt(0) != 'h' ) {
         return false;
      }
      final char level = h.charAt(1);
      return level >= '1' && level <= '6';
   }

   // Called when looking for titles and <FONT> element is encountered.
   // If there is an attribute like size="+3" take this equivalent to <H1>
   private boolean isSizeChange (AttributeList attrs) {
      // No attribute list at all: no size attribute either.
      if (attrs == null) {
         return false;
      }
      final String size = attrs.getValue("size");
      if ( DEBUG ) {
         System.err.println(ME+"in isSizeChange() found size="+size);
      }
      // Only these three size values count as a heading-like size change.
      return size != null
         && ( size.equals("+3") || size.equals("5") || size.equals("+2") );
   }

   // One title is from the metadata at the start of the document,
   // the other title is in the text itself.  lowerCase compare
   private String resolveTitles (String title, String textString) {
      // No title known yet: the text string becomes the title.
      if ( title == null ) {
         System.err.println(ME+"set title: " + textString );
         return textString;
      }

      // Both titles agree (ignoring case): keep the text's spelling.
      if ( title.equalsIgnoreCase ( textString ) ) {
         System.err.println(ME+"definite title: "
         + title );
         return textString;
      }

      // A longer text string wins over the existing title.
      if ( title.length() < textString.length() ) {
         return textString;
      }

      // handle case of title with colon spread over 2 lines
      int separator = title.indexOf(":");
      if ( separator == -1 ) {
         separator = title.indexOf(".");  // sometimes a "."
      }
      if (DEBUG) {
         System.err.println(ME+"title not shorter than textString, and "
            + "search for ':'|'.' yielded " + separator);
      }

      if ( separator != -1 ) {
         // Lower-case both sides, and flatten newlines and trim the text
         // string, before testing whether it begins the title.
         final String lowerTitle = title.toLowerCase();
         final String flatText =
            textString.toLowerCase().replace('\n',' ').trim();
         final boolean textStartsTitle = lowerTitle.startsWith(flatText);
         if (DEBUG) {
            System.err.println(ME+"title starts with "
               + textString + ": "
               + textStartsTitle );
         }
         if ( textStartsTitle ) {
            moreTitle = true;
         }
         // a more stringent "starts with" test would ignore special
         // characters, i.e. "<title>Global-Info... would match
         // <h2>Global Info...
         // could check that textString ends title, if moreTitle, but
         // we want to leave moreTitle set for when </h3> is found.
      }
      return title;
   }


}
