// bergmark - june 2000 - reference linking project

package Linkable.Analysis;

// SentenceTree is an Analysis Helper class that keeps around a little tree
// that corresponds to a single sentence being analyzed.  It can handle just
// straight ASCII as well as markup language (XML, HTML).  This is to aid
// analyzers that read in a sentence in chunks.

// For generality, nodes have types: text, tag, and attribute
// attribute nodes have a type and a value

// One assertion about the SentenceTree object is that its textual
// value does not have any leading whitespace.

// The hint: how to parse for reference tags in free text.
// Either the hint is specified at constructor time, or else
// it is deduced by seeing which pattern the references in this document
// seem to match.  That is, both parsers are applied and the more successful
// one wins.

// After the first handle (reference tag) has been found for a document, the
// same general format will obtain for the remaining reference tags in that
// same document.  Hence constructors are called with the previous sentence
// which, if not null, contains the hint that is currently in effect.

// It would simplify this routine to simply maintain a String Buffer,
// not a whole tree, but the RefLinkAnalyzers might want to compare
// texts with sentences fragment by fragment.  For example, XHTMLAnalyzer
// matches the first hunk of each sentence with the document.  One could
// simplify by using a linked list instead of a tree.

// updates:
// 2000-10-24:  Enable brackets around semi-coloned names and years
//              Do away with addSon and terminateCurrentTree
//              Fix isNameAndYear to require year with ')'
// 2000-10-31:  squareRefs should return [3][4] rather than [3, 4]
//              Also [1][2] rather than [1-2]
// 2000-11-02:  Add a helper routine, endsWith()
// 2000-11-22:  Distinguish between anchors (which are text fragments)
//              and normalized anchors (formally known as anchors)
//              Eliminate "contextNumber"
// 2000-12-01:  parensref will not be able to override hint from squareref
// 2000-12-04:  bracketsRef calls enclosedAcronym
// 2000-12-06:  pull apart strung-together references 
// 2000-12-07:  fix yesterday's bugs, finish up Split
// 2001-06-04:  Add code back into "isNameAndYear" that accidentally got
//              deleted last September.
// 2001-06-12:  Fixed bug in loneYear where anchors was used before allocation
// 2001-06-13:  Try to clean up the reference anchors a bit more.
// 2001-07-23:  Crashes on [17, p.73].  This should be parsed into [17].
// 2001-07-27:  Add code to handle the case "[1]-[4]"
// 2001-08-01:  Added append, beginsWith, and getRoot to facilitate handling
//              premature contexts better
// 2001-08-07:  Moved updateContextTrees here from XHTMLAnalyzer

import java.util.Vector;
import java.util.StringTokenizer;
import java.util.NoSuchElementException;
import java.util.Enumeration;

import Linkable.Utility.CONFIG;

import uk.ac.soton.harvester.Utils;

/**
 * The SentenceTree class encapsulates one sentence from the text being analyzed.
 * It contains the text (as a tree of nodes), the references that were found in
 * this sentence, and the actual anchor strings corresponding to the references.
 */
public class SentenceTree {

   // Prefix put in front of every diagnostic message written by this class.
   private final static String ME = "SentenceTree: ";
   // Tracing switch, copied from the project-wide CONFIG setting.
   private static final boolean DEBUG = CONFIG.DEBUG;

      // root is the first node of the sentence; currentTree is the node
      // after which addNode links the next node.
      private Node root, currentTree = null;
      // Note: currentTree.next is always null
      // If currentTree is null, then first addNode starts the new tree

      // Reference style in effect: one of the style constants declared
      // below, or -1 while the style is still undetermined.
      protected int hint = -1;  // same as unknown

      // Node type codes handed to the Node constructor (addNode uses TEXT).
      private static final int TEXT = 0;
      private static final int TAG = 1;
      private static final int HREF = 2;
      private static final int ENDTAG = 3;

      // Reference styles.  findReferences dispatches on these values:
      //   SQUARE_BRACKETS_AROUND_NUMERALS              -> squareRef
      //   PARENTHESES_AROUND_NAMES_AND_YEAR            -> parensRef
      //   SQUARE_BRACKETS_AROUND_ACRONYMS              -> squareAcronym
      //   PARENTHESES_AROUND_COMMAED_NAMES_AND_YEARS   -> parensRef
      //   BRACKETS_AROUND_COMMAED_NAMES_AND_YEARS      -> bracketsRef
      //   CURLY_BRACKETS_AROUND_ACRONYMS               -> curlyAcronym
      protected static final int SQUARE_BRACKETS_AROUND_NUMERALS = 0;
      protected static final int PARENTHESES_AROUND_NAMES_AND_YEAR = 1;
      protected static final int SQUARE_BRACKETS_AROUND_ACRONYMS = 2;
      protected static final int PARENTHESES_AROUND_COMMAED_NAMES_AND_YEARS = 3;
      protected static final int BRACKETS_AROUND_COMMAED_NAMES_AND_YEARS = 4;
      protected static final int CURLY_BRACKETS_AROUND_ACRONYMS = 5;

      // refsInText and anchors are protected but will only be used by
      // another SentenceTree object (via the append operation), and
      // so the representation does not have to be encapsulated.
      // Both stay null until a parser finds at least one reference.
      protected Vector refsInText = null;   // reference tags found in tree

      // text fragments that correspond to refs
      protected Vector anchors = null; 


   // Strings that a sentence might appear to stop with (FALSESTOPS) and the
   // matching beginning of the following fragment (FALSESTARTS) which, when
   // both are present, mean the stop was not a real end of sentence.
   // The two arrays have the same length and are presumably paired by
   // index -- TODO confirm against the caller in XHTMLAnalyzer.
   // Placeholders: "<nA>" denotes a "not uppercase" character and "<digit>"
   // a decimal digit (both are interpreted by beginsWith / endsWith).
   // NOTE(review): "<a>" is not interpreted specially by beginsWith, so it
   // is matched as literal text there; its intended meaning is not visible
   // in this part of the file.
   // NOTE(review): an earlier remark here said "e.g" was omitted because so
   // many sentences end with "e.", yet "e.g." is present in FALSESTOPS.
   private static String[] FALSESTOPS = {"et al.", "etc.", "et.",
      "i.", "<digit>.", "e.g.", "i.e." };
   private static String[] FALSESTARTS= {"<nA>",    "<a>",  "al.",
      "e.", "<digit>", "", "" };


   //============   PROTECTED ROUTINES USED IN ANALYSIS PACKAGE ==========

   /**
    * Creates an empty sentence tree.  No hint is supplied, so the
    * reference style stays at its initial "unknown" value.
    */
   protected SentenceTree () {
      if (DEBUG) {
         System.err.println(ME + "instantiated");
      }
   }

   /**
    * Creates an empty sentence tree that inherits the reference-style hint
    * of a previously analyzed sentence.
    * @param st the earlier sentence tree whose hint is copied; may be null,
    *           in which case the hint keeps its initial value
    */
   protected SentenceTree (SentenceTree st) {
      if (st == null) {
         if (DEBUG) {
            System.err.println(ME + "instantiated with no hint");
         }
         return;
      }
      this.hint = st.hint;
      if (DEBUG) {
         System.err.println(ME + "instantiated with hint = " + hint);
      }
   }

   /**
    * Creates an empty sentence tree with an explicitly chosen hint.
    * @param hint the reference-style code to start with
    */
   protected SentenceTree (int hint) {
      this.hint = hint;
      if (DEBUG) {
         System.err.println(ME + "instantiated with hint = " + hint);
      }
   }

   // reset -
   /**
    * Empties this tree so it can be reused for the next sentence.
    * The nodes, the references found and their anchors are dropped;
    * the hint is deliberately left alone because the style of references
    * does not change over the course of a document.
    */
   protected void reset () {
      // found references and their anchor texts
      anchors = null;
      refsInText = null;
      // the node chain itself
      currentTree = null;
      root = null;
   }

   // getHint -
   /**
    * Reports the reference-style hint currently in use.
    * @return the hint of this SentenceTree
    */
   protected int getHint() {
      return this.hint;
   }


   // addNode -
   /**
    * Adds one more text node at the end of this sentence.  The very first
    * node becomes the root and has its content trimmed; later nodes are
    * linked after the current node, share its parent, and keep their
    * content unchanged.
    * @param content the textual content of the node to be added
    */
   protected void addNode ( String content ) {
      if ( currentTree != null ) {
         // extend the existing chain
         Node appended = new Node(content, TEXT, currentTree.parent, null, null);
         currentTree.next = appended;
         currentTree = appended;
      } else {
         // first node of the sentence
         Node first = new Node(content.trim(), TEXT, null, null, null);
         currentTree = first;
         root = first;
      }
      if (DEBUG) {
         System.err.println(ME + "after adding new node: -->"
            + this.dump() + "<--");
      }
   }

   // append -
   /**
    * Adds another SentenceTree object to the end of this one: its node
    * chain is linked after the current node, and its references, anchors
    * and sentence-level flags are merged into this object.
    *
    * An empty receiver simply adopts the nodes of x, and an empty x
    * contributes no nodes.  After the call the current node is the last
    * node of the combined chain, so that a following addNode extends the
    * chain instead of cutting off nodes that came from x.
    *
    * @param x a sentence tree whose textual value is to be appended
    * to the textual value of this sentence tree; null is ignored
    */
   protected void append ( SentenceTree x ) {
      if ( x == null ) return;

      Node appended = x.getRoot();
      if ( appended != null ) {
         if ( currentTree == null ) root = appended;   // this tree was empty
         else currentTree.next = appended;
         // Restore the invariant "currentTree.next is always null" by
         // moving to the tail of the appended chain.
         currentTree = appended;
         while ( currentTree.next != null ) currentTree = currentTree.next;
      }
      if ( DEBUG ) System.err.println(ME+"after appending another tree: -->"
         + this.dump() + "<--");

      // also update the instance data
      if (x.refsInText != null) {
         if (DEBUG) System.err.println(ME+"append merging refsInText...");
         if ( refsInText == null ) refsInText = new Vector();
         for (int i=0; i<x.refsInText.size();i++)
            refsInText.addElement((String)(x.refsInText).elementAt(i));
         if (DEBUG) System.err.println(ME+"new refsInText: "+refsInText+
         "\nmerging anchors...");
         if (anchors==null) anchors = new Vector();
         // x may carry references without anchors; do not fail on that.
         if ( x.anchors != null ) {
            for (int i=0; i<x.anchors.size();i++)
               anchors.addElement((String)(x.anchors).elementAt(i));
         }
         if (DEBUG) System.err.println(ME+"new anchors: "+anchors);
      }
      hasReferences = hasReferences || x.hasReferences;
      // the combined sentence ends the way the appended part ends
      whichFALSESTOP = x.whichFALSESTOP;
      if (DEBUG) System.err.println(ME+"combined tree hasReferences: "
      + hasReferences + " whichFALSESTOP= " + whichFALSESTOP);
   }

   // getRoot -
   /**
    * Gives access to the node at the top of this tree.
    * @return the root Node, or null when no node has been added
    */
   protected Node getRoot() {
      return this.root;
   }

   // findReferences -
   /**
    * Looks for references in this Sentence Tree and stores the tags found
    * in refsInText (the parsers fill in anchors as a side effect).
    *
    * When the hint names a known style, only the parser for that style is
    * run.  For any other hint value, squareRef and parensRef are both
    * tried and the one that finds more tags wins (squareRef on a tie);
    * if neither finds anything, curlyAcronym is tried last.
    *
    * @return true if this sentence appears to contain some references.
    */
   protected boolean findReferences ( ) {
      if (DEBUG) System.err.print(ME+"in findReferences with hint "+hint);

      // The parsers may change hint, so dispatch on the value seen on entry.
      final int style = hint;

      if ( style == SQUARE_BRACKETS_AROUND_NUMERALS ) {
         refsInText = squareRef( );
         if (DEBUG) System.err.println("SQUARE_BRACKETS_AROUND_NUMERALS");
      } else if ( style == PARENTHESES_AROUND_NAMES_AND_YEAR ) {
         refsInText = parensRef( );
         if (DEBUG) System.err.println("PARENTHESES_AROUND_NAMES_AND_YEAR");
      } else if ( style == SQUARE_BRACKETS_AROUND_ACRONYMS ) {
         refsInText = squareAcronym( );
         if (DEBUG) System.err.println("SQUARE_BRACKETS_AROUND_ACRONYMS");
      } else if ( style == PARENTHESES_AROUND_COMMAED_NAMES_AND_YEARS ) {
         refsInText = parensRef( );
         if (DEBUG) System.err.println(
              "PARENTHESES_AROUND_COMMAED_NAMES_AND_YEARS");
      } else if ( style == BRACKETS_AROUND_COMMAED_NAMES_AND_YEARS ) {
         refsInText = bracketsRef( );
         if (DEBUG) System.err.println(
              "BRACKETS_AROUND_COMMAED_NAMES_AND_YEARS");
      } else if ( style == CURLY_BRACKETS_AROUND_ACRONYMS ) {
         if (DEBUG) System.err.println("CURLY_BRACKETS_AROUND_ACRONYMS");
         refsInText = curlyAcronym( );
      } else {
         if ( DEBUG ) System.err.println("-1");

         // First candidate: square brackets.  Remember what it left behind.
         Vector squareHits = squareRef( );
         int hintAfterSquare = hint;
         Vector anchorsAfterSquare = anchors;

         // Second candidate: parentheses, starting from a clean anchor list.
         anchors = null;
         Vector parenHits = parensRef( );

         int squareCount = ( squareHits == null ) ? 0 : squareHits.size();
         int parenCount = ( parenHits == null ) ? 0 : parenHits.size();

         if ( parenCount > squareCount ) {
            refsInText = parenHits;
         } else {
            // squareRef wins: put its hint and anchors back in place
            hint = hintAfterSquare;
            anchors = anchorsAfterSquare;
            refsInText = squareHits;
         }
         if ( refsInText == null ) refsInText = curlyAcronym( );
      }
      return refsInText != null;
   }

   // endsWith -
   /**
    * A helper routine to check whether this really is the end of a sentence.
    * The sentence text is taken with newlines turned into blanks and
    * surrounding whitespace removed.  The special word "&lt;digit&gt;."
    * matches a sentence of at least two characters that ends in a digit
    * followed by a period; every other word must match literally.
    * @param words the candidate endings to check (e.g. "etc.")
    * @return index of the first word this sentence ends with, else -1.
    * Note: used by XHTMLAnalyzer and possibly other analysis routines.
    */
   protected int endsWith ( String[] words ) {
      final String sentence = root.text().replace('\n',' ').trim();
      final int length = sentence.length();

      for ( int idx = 0; idx < words.length; idx++ ) {
         String candidate = words[idx];
         boolean digitRule = candidate.equals("<digit>.") && length > 1;
         if ( digitRule ) {
            if ( Character.isDigit(sentence.charAt(length-2))
                 && sentence.endsWith (".") ) {
               return idx;
            }
         } else if ( sentence.endsWith ( candidate ) ) {
            return idx;
         }
      }
      return -1;
   }

   // beginsWith -
   /**
    * A helper routine to check whether this sentence begins with a false
    * start.  Called only if this sentence follows one that ended
    * with a false stop.
    *
    * Leading whitespace is skipped before the comparison.  The empty
    * string matches everything.  "&lt;digit&gt;" matches a leading decimal
    * digit and "&lt;nA&gt;" a leading character that is not uppercase; any
    * other value must appear literally at the start of the sentence.  A
    * sentence with no non-whitespace text only matches the empty string.
    *
    * NOTE(review): "&lt;a&gt;" occurs in FALSESTARTS but is not given a
    * special meaning here, so it is compared literally -- confirm intent.
    *
    * @param fs the string indicating what a false start would be
    * @return true if this sentence begins with the indicated
    * string (after optional whitespace).
    */
   protected boolean beginsWith ( String fs ) {
      String text = ( root == null ) ? "" : root.text();
      if (DEBUG) System.err.println(ME+"beginsWith ["+fs+"]->"+text+"<-");
      if ( fs.equals ("") ) return true;

      // Find the first character that is not whitespace.
      int start = 0;
      while ( start < text.length()
              && Character.isWhitespace(text.charAt(start)) ) start++;
      if ( start == text.length() ) return false;   // nothing to look at

      char first = text.charAt(start);
      if ( fs.equals ("<digit>") ) return Character.isDigit(first);
      if ( fs.equals ("<nA>") ) return !Character.isUpperCase(first);

      // The match must be at the start, not just anywhere in the text.
      return text.startsWith(fs, start);
   }

   // dump - returns the root node's dump of the tree, or the empty
   // string when the tree has no nodes.
   protected String dump() {
      return ( root == null ) ? "" : root.dump();
   }

   // text - returns only the text of the tree as produced by the root
   // node, or the empty string when the tree has no nodes.
   protected String text() {
      return ( root == null ) ? "" : root.text();
   }

   // getLinks -
   /**
    * Formats the table of potential links as one line of text.
    * @return a string that looks like "References in this context: [1][8]"
    * followed by a newline; "(none)" takes the place of the tags when no
    * references have been recorded.
    */
   protected String getLinks() {
      final String header = "References in this context: ";
      final String listing = ( refsInText == null ) ? "(none)" : getTags();
      return header + listing + "\n";
   }

   // getAnchors -
   /**
    * Returns the anchors corresponding to the potential links as an array.
    * @return a String[] holding the anchor text of each link, in the order
    * they were recorded, or null if there are no links in this context.
    */
   protected String[] getAnchors() {
      if ( anchors == null ) {
         return null;
      }
      String[] copy = new String[anchors.size()];
      int slot = 0;
      while ( slot < copy.length ) {
         copy[slot] = (String) anchors.elementAt(slot);
         slot++;
      }
      return copy;
   }

   // getTags -
   /**
    * Helper that returns the recorded tags joined into a single string.
    * Called by getLinks and by XHTMLAnalyzer.
    * @return a String of tags, like "[1][2]"; empty when none are recorded
    */
   protected String getTags() {
      if ( refsInText == null ) {
         return "";
      }
      StringBuffer joined = new StringBuffer();
      final int count = refsInText.size();
      for ( int k = 0; k < count; k++ ) {
         joined.append( (String) refsInText.elementAt(k) );
      }
      return joined.toString();
   }


   // =============  PRIVATE ROUTINES ==================================

   // squareRef -
   /**
    * Recognizes numeric references of the form [...].
    *
    * Accepted tags: '[' &lt;number&gt; { ',' &lt;number&gt; } ']' where
    * &lt;number&gt; is &lt;digits&gt; or &lt;digits&gt; '-' &lt;digits&gt;,
    * and '[' &lt;digits&gt; ']' '-' '[' &lt;digits&gt; ']'.  A page
    * indication ("p." followed by anything up to the next ',' or ']') is
    * dropped.  Formats handled: [1,2,3], [5], [5-10], [17,p.73], [1]-[4],
    * [1][2].
    *
    * The text is scanned token by token with a small state machine; each
    * completed tag and its anchor text are handed to expand().
    *
    * Side effects: the "anchors" vector is (re)created when the first tag
    * is found, and hint is set to SQUARE_BRACKETS_AROUND_NUMERALS when at
    * least one tag was found.  If the very first bracket of a document with
    * unknown hint does not hold a number, the work is delegated to
    * bracketsRef().
    *
    * @return reference tags as produced by expand(), or null if none found
    */
   private Vector squareRef( ) {

      if (DEBUG) System.err.println(ME+"in squareRef");

      // An empty tree has no text and therefore no references.
      if ( root == null ) return null;

      String context = root.text();
      int i = context.indexOf("[");
      if ( i == -1 ) return null;
      context = context.replace('\n',' ');

      // States of the scanner; exactly one of them is true at any time.
      boolean findReference = true;    // looking for '['
      boolean findRefNumber = false;   // looking for digits after '[', ',' or '-'
      boolean growNumber = false;      // have digits, looking for ']' ',' '-'
      boolean discardPage = false;     // skipping a page indication
      boolean lookForDash = false;     // have "[<digits>]", is there a '-'?
      boolean lookForBracket = false;  // have "[<digits>]-", is there a '['?

      Vector result = null;            // all the references in this context
      StringBuffer sb = null;          // normalized tag, e.g. "[1-4"
      StringBuffer anchorBuf = null;   // anchor text, without the brackets
      String token;
      StringTokenizer st = new StringTokenizer ( context.substring(i),
         "[],- ", true);

      while ( st.hasMoreTokens() ) {
      try {

         token = st.nextToken();

         if (DEBUG) System.err.println(ME+
             " token is \"" + token + "\"" +
             ", buffer is <" + (sb==null?"null":sb.toString()) + ">" );

         // Look for '['.  If we run out of tokens, we are done.
         if ( findReference ) {
            while ( !token.equals("[") ) token = st.nextToken();
            sb = new StringBuffer(); sb.append("[");
            anchorBuf = new StringBuffer();  // no "[" into anchor
            findReference = false; findRefNumber = true;
            if ( DEBUG )
            System.err.println(ME+"findReference false, findRefNumber true");

         // Look for a number.  Call bracketsRef if '[' is followed
         // by anything except a " " or a digit and the result
         // vector is null (i.e. this is the first "[") and if
         // the hint is -1 (i.e. this is the first tag in document)
         // Otherwise if hint is set and we get some bogus "[the]" in
         // the context, start over again.
         } else if ( findRefNumber ) {
            if ( isNumeric ( token ) ) {
               sb.append( token ); anchorBuf.append( token );
               growNumber = true; findRefNumber = false;
               if (DEBUG)
               System.err.println(ME+"findRefNumber false, growNumber true");
            }
            else if ( token.equals(" ") ) {
               sb.append( token );
               anchorBuf.append( token );
            }
            // "p." alone is a page marker; "p.73" is a single token (there
            // is no delimiter inside it) and is accepted as a page marker
            // once something has already been collected for this tag.
            else if ( token.equals("p.")
                      || ( token.startsWith("p.") && anchorBuf.length() > 0 ) ) {
               if ( DEBUG ) {
                  System.err.println(ME+"discarding page number");
                  System.err.println(ME+"growNumber false, doscardPage true");
               }
               growNumber = false; discardPage = true;
               findRefNumber = false;
            }
            else if ( ( hint == -1 ) && ( result == null ) )
                 return bracketsRef ( );
            else {
               if ( DEBUG )
               System.err.println(ME+"throwing away bogus reference");
               findReference = true; findRefNumber = false;
            }
         }

         // We got a number.  Scan until ']', ',' or '-' or out of tokens.
         // If we get a "]" delay finishing up until we know we do not
         // have a [<digits>]-[<digits>] situation.
         else if ( growNumber ) {
            if ( token.equals ( "]" ) ) {
               growNumber = false; lookForDash = true;
               if (DEBUG)
               System.err.println(ME+"growNumber false, lookForDash true");
            } else if ( token.equals ( "," ) || token.equals( "-" ) ) {
               sb.append(token); anchorBuf.append(token);
               growNumber = false; findRefNumber = true;
               if (DEBUG)
               System.err.println(ME+"growNumber false, findRefNumber true");
            } else if ( token.equals ( " " )) {
               sb.append(token); anchorBuf.append(token);
            } else {
               growNumber = false; findReference = true;
               if (DEBUG)
               System.err.println(ME+"growNumber false, findReference true");
            }
         }

         // We had gotten a "[<digits>]".  Look for a "-", skipping blanks.
         else if ( lookForDash ) {
            if ( token.equals("-") ) {
               lookForDash = false; lookForBracket = true;
               if (DEBUG)
               System.err.println(ME+"lookForDash false, lookForBracket true");
            } else if ( !token.equals(" ") ) {
               // Process the "[<digits>]" part
               sb.append("]");
               result = recordSquareRef ( result, sb, anchorBuf );
               lookForDash = false;
               if ( token.equals("[") ) {
                  // This token already opens the next tag, as in "[1][2]";
                  // start on it right away instead of losing it.
                  sb = new StringBuffer(); sb.append("[");
                  anchorBuf = new StringBuffer();
                  findRefNumber = true;
                  if (DEBUG)
                  System.err.println(ME+"lookForDash false, findRefNumber true");
               } else {
                  findReference = true;
                  if (DEBUG)
                  System.err.println(ME+"lookForDash false, findReference true");
               }
            }
         }

         // We had gotten a "[<digits>]-".  Look for a "[".
         // sb contains "[<digits>"   anchorBuf contains "<digits>"
         else if ( lookForBracket ) {
            if ( token.equals("[") ) {
               // Process the "[<digits>]-" part
               sb.append("-"); anchorBuf.append("-");
               lookForBracket = false; findRefNumber = true;
               if ( DEBUG )
               System.err.println(ME+"lookForBracket false, findRefNumber true");
            } else if ( ! token.equals(" ") ) {
               // Did not get the "[" we were looking for:
               // process the "[<digits>]" part
               sb.append("]");
               result = recordSquareRef ( result, sb, anchorBuf );
               lookForBracket = false; findReference = true;
               if ( DEBUG )
               System.err.println(ME+"lookForBracket false, findReference true");
            }
         }

         // We had gotten a page marker.  Discard tokens until we reach
         // a "]" or a ",".  Separators collected just before the page
         // marker are removed so that "[17, p.73]" becomes "[17]".
         else if ( discardPage ) {
            if ( token.equals(",") ) {
               stripTrailingSeparators ( sb, 1 );
               stripTrailingSeparators ( anchorBuf, 0 );
               if ( sb.length() > 1 ) {      // something precedes the page
                  sb.append(token); anchorBuf.append(token);
               }
               discardPage = false; findRefNumber = true;
               if ( DEBUG )
               System.err.println(ME+"discardPage false, findRefNumber true");
            }
            else if ( token.equals("]") ) {
               stripTrailingSeparators ( sb, 1 );
               stripTrailingSeparators ( anchorBuf, 0 );
               if ( sb.length() > 1 ) {      // ignore a tag that was only a page
                  sb.append(token);
                  result = recordSquareRef ( result, sb, anchorBuf );
               }
               discardPage = false; findReference = true;
               if ( DEBUG )
               System.err.println(ME+"discardPage false, findReference true");
            }
         }

      } catch (NoSuchElementException e) { break; } // no more tokens
      } // end while more tokens

      // The text ended right after "[<digits>]" (or "[<digits>]-"): the tag
      // is complete, it just had not been emitted yet.
      if ( lookForDash || lookForBracket ) {
         sb.append("]");
         result = recordSquareRef ( result, sb, anchorBuf );
      }

      if ( DEBUG ) {
         System.err.println(ME+" ran out of tokens");
         System.err.println(ME+"squareRef returning a vector of "
         + (result==null?0:result.size()) + " elements, "
         + " current size of anchors Vector is "
         + (anchors==null?0:anchors.size()) );
      }
      if ( result != null ) hint = SQUARE_BRACKETS_AROUND_NUMERALS;
      return result;

   } // squareRef

   // recordSquareRef - adds one finished tag to the result of squareRef,
   // creating the result and anchors vectors when this is the first tag.
   private Vector recordSquareRef
   ( Vector result, StringBuffer sb, StringBuffer anchorBuf ) {
      if ( result == null ) {
         result = new Vector();
         anchors = new Vector();
      }
      result.addElement (expand(sb.toString(), anchorBuf.toString()));
      return result;
   }

   // stripTrailingSeparators - removes ',', ' ' and '-' characters from the
   // end of buf, never shortening it below "keep" characters.
   private static void stripTrailingSeparators ( StringBuffer buf, int keep ) {
      int end = buf.length();
      while ( end > keep ) {
         char c = buf.charAt(end-1);
         if ( c != ',' && c != ' ' && c != '-' ) break;
         end--;
      }
      buf.setLength(end);
   }

   // parensRef -
   /** Recognizes references of the form "(... year)", for example
   * (Besser,1994,Cringley,1996)
   * (Jones et al.,1999)  (Jones & Smith, 1999)
   * (Institution name, 1999) (Alvin,1998,Bailey,1999)
   * (Evans et al. 1989a; Jones 1991)   Bray (1997)
   * (Hitchcock et al. 1996, 1997a) (TBD)
   *
   * The text from the first '(' onward is split on "(),; " (delimiters are
   * returned as tokens) and scanned with two states: findReference looks
   * for an opening "(", findElement collects one name-and-year element
   * with the help of isNameAndYear.
   *
   * Side effects: when at least one element is found, hint is set to
   * PARENTHESES_AROUND_COMMAED_NAMES_AND_YEARS (the code does not check
   * the previous value of hint), and anchor strings are added to the
   * anchors Vector for elements recognized by isNameAndYear.
   *
   * @return A vector holding cleanup() of each recognized element,
   * or null if none was found. */

   private Vector parensRef( ) {
      if (DEBUG) System.err.println(ME+"in parensRef");
      Vector result = null;  // all the references in this context
      StringBuffer sb = null;         // text of the element being built
      StringBuffer anchorBuf = null;  // anchor text of that element

      boolean findReference = true;  // ( <element> [,<element>]* )
      boolean findElement = false;   // <names>[et al.] , <year>
      boolean findYear = false;      // not used in this method

      String context = root.text();
      int i = context.indexOf( '(' );   // position of the first '('
      if ( i == -1 ) return null;
      context = context.replace('\n',' ');
      StringTokenizer st = new StringTokenizer
	       (context.substring(i), "(),; ", true);
      String token;
      Vector tokens=null;   // tokens of the current reference, for isNameAndYear

      if ( DEBUG ) {
	 System.err.println(ME+"examining a sentence that contains a '(':\n"
	 + context);
      }

      while ( st.hasMoreTokens() ) {

      try {

	 token = st.nextToken(); 

	 if ( findReference ) {            // look for "("
	    // Skip ahead to the next "(".  If the tokens run out first the
	    // loop stops on the last token and the state still advances.
	    while ( !token.equals("(") && st.hasMoreTokens() )
	       token = st.nextToken();
	    sb = new StringBuffer(); sb.append("(");
	    tokens = new Vector ();
	    tokens.addElement (token);
            anchorBuf = new StringBuffer();
	    findReference = false; findElement = true; //i++;
	    if (DEBUG) System.err.println(ME+"parensRef has '('");

	 // findElement is true after opening "(" or after "<element>," 
	 } else if ( findElement ) {
	    sb.append(token);  // name
            anchorBuf.append(token);
	    String element = isNameAndYear ( st, tokens, ")" );
	    /* either we are out of tokens or we have a "," */
	    // NOTE(review): this break is taken even when isNameAndYear has
	    // just returned an element that ended the text, so such an
	    // element is not recorded -- confirm this is intended.
	    if ( !st.hasMoreTokens() ) break;
	    if ( element == null ) {  
	       if ( isYear ( token ) ) { // is a lone year e.g. (1998)
		  // NOTE(review): i is the index of the first '(' in the
		  // context and is never advanced (see the commented-out
		  // i++ above) -- verify what loneYear expects here.
		  String anchor = loneYear ( context, i, token.length() );
		  if ( anchor == null ) {
		     findElement = false; findReference = true;
		  } else {  // reset sb to complete anchor
		     // keep the leading "(" and replace the rest with the
		     // anchor minus its first and last character, plus ")"
		     sb.replace(1,sb.length(),
			anchor.substring(1,anchor.length()-1) + ")" );
		     if ( result == null ) result = new Vector();
		     result.addElement ( cleanup(sb.toString()) );
		     // No anchor is added here; presumably loneYear takes
		     // care of the anchors Vector -- TODO confirm.
	             if (DEBUG) System.err.println(ME+"parensRef has element" 
	             + ".  Buffer is: " + sb.toString() );
		     findReference = true; findElement = false;
		  }
	       } else { // discard this false start
	          findElement = false; findReference = true;
	       }
	    } else {
	       sb.append( element );  
               anchorBuf.append( element );
               if ( result == null ) result = new Vector();
               result.addElement ( cleanup(sb.toString()) );
               // drop the last character (the token that ended the element)
               // from the anchor text before storing it
               anchorBuf.delete ( anchorBuf.length()-1, anchorBuf.length());
	       if ( anchors == null ) anchors = new Vector();
               anchors.addElement ( anchorBuf.toString() );
	       if (DEBUG) System.err.println(ME+"parensRef has element" 
	          + ".  Buffer is: " + sb.toString() + ". anchor is " + 
                  anchorBuf.toString() );
	
               // See what character terminated the <element>:
               // ")" closes the reference, so look for the next "(";
               // otherwise stay in findElement with fresh, empty buffers.
               if ( sb.charAt ( sb.length()-1 ) == ')' ) findReference = true;
	       else {sb = new StringBuffer(); anchorBuf = new StringBuffer();}
            }
	 }  // findElement

      } catch (NoSuchElementException e) { break; } // no more tokens 

      }  // end while there are more tokens
      if ( DEBUG )
      System.err.println(ME+"parensRef is out of tokens");
      if ( result != null ) hint = PARENTHESES_AROUND_COMMAED_NAMES_AND_YEARS;
      if (DEBUG) System.err.println(ME+"parensRef returning a vector of " 
	 + (result==null?0:result.size()) + " elements");
      return result;
   } // parensRef

   // curlyAcronym -
   /**
    * Returns a Vector of all the tags found in this context that are
    * written in curly brackets, such as {FOOBAR}, {ONE, TWO} or {ONE,TWO}.
    * The work is done by enclosedAcronym, which sets hint to
    * CURLY_BRACKETS_AROUND_ACRONYMS when a tag is found.
    * Note: this routine is called only if (1) {refs} were found before
    * or (2) hint is -1
    * @return what enclosedAcronym returns for curly brackets
    */
   private Vector curlyAcronym( ) {
      final String callerName = "curlyAcronym";
      final String delimiters = "{}, ";      // brackets first, then separators
      final boolean yearRequired = false;
      return enclosedAcronym ( callerName, delimiters, yearRequired,
         CURLY_BRACKETS_AROUND_ACRONYMS );
   }

   // enclosedAcronym -
   /**
    * Returns a vector of all the tags found in this context.  The tags
    * are expected to be in the form of &lt;left&gt;&lt;acronym&gt;&lt;right&gt;
    * where left,right = [] or {} or ().  The acronym could also be a
    * commaed list of acronyms.
    *
    * The brackets are the first two characters in "delims".  The
    * remaining characters are additional parsing tokens.
    *
    * Side effects: when the first bracketed reference is found, the
    * anchors Vector is replaced by a new one; the references and anchors
    * produced by Split for each bracketed group are added to the result
    * and to anchors.  If at least one reference is found, hint is set to
    * "hintName".
    *
    * Side effect(TBD): if BRACKETS_AROUND_COMMAED_NAMES_AND_YEARS and
    * there are no years in the context which does, however contain
    * "[...]", reset hint to SQUARE_BRACKETS_AROUND_ACRONYMS
    *
    * @param caller name of the calling routine, used in trace output only
    * @param delims left bracket, right bracket, then further delimiters
    * @param needYear true to scan elements with isNameAndYear, false to
    *        scan them with nextElement
    * @param hintName value given to hint when a reference is found
    * @return the references found, or null if there are none (also for
    *         an empty tree)
    */

   private Vector enclosedAcronym ( String caller, String delims, 
	  boolean needYear, int hintName ) {

      // An empty tree has no text and therefore no references.
      if ( root == null ) return null;

      Vector result = null;           // all the references in this context
      boolean findReference = true;   // states of our finite state machine
      boolean findElement = false;
      String context = root.text();   // the context string to be examined
      char left = delims.charAt(0);   // left, right delimiters in this paper
      char right = delims.charAt(1);
      String sleft = delims.substring(0,1);
      String sright = delims.substring(1,2);
      String token, element;          // for parsing this context
      Vector tokens=null;             // accumulates tokens in one reference

      int i = context.indexOf(left);  // start scanning at the first bracket
      if ( i == -1 ) return null;

      context = context.replace('\n',' ');
      StringTokenizer st =
         new StringTokenizer( context.substring(i), delims, true);

      if ( DEBUG ) {
         System.err.println(ME+"examining a sentence that contains a " +
         sleft + ":\n" + context);
      }

      while ( st.hasMoreTokens() ) {

      try {

         token = st.nextToken();

         if ( findReference ) {            // look for left delimiter
            // running out of tokens here ends the scan (see catch below)
            while ( !token.equals(sleft) )
               token = st.nextToken();
            tokens = new Vector(); tokens.addElement(token);
            findReference = false; findElement = true;
            if (DEBUG) System.err.println(ME + caller + " has " + left );

         // findElement is true after opening delimiter or after "<element>,"
         } else if ( findElement ) {
            tokens.addElement(token);

            // scan for next element or the right delimiter (as a string)
            if ( needYear ) element = isNameAndYear ( st, tokens, sright );
            else element = nextElement ( st, tokens, sright );
            if ( element == null ) {   // not what we are looking for
               if ( st.hasMoreTokens() ) {
                  // There are more tokens left, but discard this proto-tag
                  findReference = true; findElement = false;
               } else break;
            } else {
               // See what character terminated the <element>
               if ( element.charAt(element.length()-1) == right ) {
                  // The group is closed: let Split turn the collected
                  // tokens into references and their anchors.
                  Split stuff = new Split ( tokens );
                  if ( DEBUG ) System.err.println(ME+caller+" has element "
                     + stuff.toString());
                  if ( result == null ) {
                     result = new Vector();
                     anchors = new Vector();
                  }
                  result.addAll(stuff.getRefs());
                  anchors.addAll(stuff.getAnchors());
                  findReference = true;
                  findElement = false;
               }
            }

         } // findElement

      } catch (NoSuchElementException e) { break; } // no more tokens

      }  // end while st has more tokens
      if ( DEBUG )
      System.err.println(ME+caller+" is out of tokens");

      if ( result != null ) hint = hintName;
      // report under the caller's name (this used to print the word "caller")
      if (DEBUG) System.err.println(ME+caller+" returning a vector of "
         + (result==null?0:result.size()) + " elements");
      return result;
   }                            // enclosedAcronym

   // isValid -
   /**
    * Determines whether this token is a valid reference.  Tokens of more
    * than two characters are always accepted.  Shorter tokens are rejected
    * when they consist of lowercase letters only (words such as "is" or
    * "a"); a null or empty token is rejected as well.
    * @param token the token
    * @return true if the token is a valid reference
    * NOTE: figure out where/when to call this.  Maybe should
    * be in Split?
    */
   private boolean isValid ( String token ) {
      if ( token == null || token.length() == 0 ) return false;
      if ( token.length() > 2 ) return true;
      // one or two characters: valid as soon as one is not a lowercase letter
      for ( int k = 0; k < token.length(); k++ ) {
         if ( !Character.isLowerCase(token.charAt(k)) ) return true;
      }
      return false;
   }                          // isValid


   // bracketsRef -
   /**
    * Handles non-numeric references of the form [...], for example
    * [Besser,1994,Cringley,1996]
    * [Jones et al.,1999]  [Jones & Smith, 1999]
    * [Institution name, 1999] [Alvin,1998,Bailey,1999]
    * and also [PRISM] or [Jones, Jones and Jones].  Semi-colons are
    * accepted as separators in addition to commas.
    * (TBD) Strings like [PRISM] should cause SQUARE_BRACKETS_AROUND_ACRONYMS
    *
    * The work is done by enclosedAcronym, which sets hint to
    * BRACKETS_AROUND_COMMAED_NAMES_AND_YEARS when a reference is found
    * and fills in anchors with the strings that correspond to the
    * references.
    * @return what enclosedAcronym returns for square brackets
    */

   private Vector bracketsRef( ) {
      if (DEBUG) {
         System.err.println(ME + "in bracketsRef");
      }
      final String callerName = "bracketsRef";
      final String delimiters = "[], ;";     // brackets first, then separators
      final boolean yearRequired = false;
      return enclosedAcronym ( callerName, delimiters, yearRequired,
         BRACKETS_AROUND_COMMAED_NAMES_AND_YEARS );
   } // bracketsRef

   // nextElement -
   /**
    * Collects tokens up to and including the next occurrence of "delim"
    * and returns them joined into one string, something like
    * " and MacNeil]" or ", Smith]".  Every token read is also added to
    * "tokens".  If the tokenizer runs dry before "delim" shows up, the
    * string simply ends with the last token that was available.
    * @param st the tokenizer carrying the state of the parse so far
    * @param tokens list that receives every token consumed here
    * @param delim the token that ends the element
    * @return the collected text, or null if no token was left at all
    */
   private String nextElement 
   ( StringTokenizer st, Vector tokens, String delim ) {

      if (DEBUG) System.err.println(ME+"in nextElement");

      if ( !st.hasMoreTokens() ) {
         if (DEBUG) System.err.println(ME+"nextElement ran out of tokens");
         return null;
      }

      StringBuffer collected = new StringBuffer();
      String current = st.nextToken();  // another name, blank, "," ";" or ']'

      // keep appending tokens until we get to the delimiter or the end
      for ( ; st.hasMoreTokens() && !current.equals(delim);
              current = st.nextToken() ) {
         collected.append(current);
         tokens.addElement(current);
      }

      // the closing delimiter, or the final token when the text ran out
      collected.append(current);
      tokens.addElement(current);
      return collected.toString();

   }             // nextElement

   // isNameAndYear
   /** Return a string that contains something like " 1994"
    * or " and MacNeil, 1996" or ", 1999" or ";1999"
    * " name, 1999" or " & Smith, 1999" or "1998a" 
    * (that is, the caller has already gobbled up the leading token).
    *
    * The scan has two phases.  First, tokens are appended until a year
    * (see isYear) or "delim" is read.  Second, after a year, blanks are
    * skipped until one of ",", ";" or "delim" is read; that terminator
    * is included in the returned string.
    *
    * Every token read from the tokenizer is appended to "tokens" exactly
    * once, in the order read.  (Previously the name-scanning loop stored
    * the first token twice and never stored the year/delimiter that ended
    * the scan.)
    *
    * Special case for square brackets only: when delim is "]" the year
    * may be missing, i.e. running into "]" before any year still yields
    * the string gathered so far.
    * @param st tokenizer, carries state of parse so far
    * @param tokens list of tokens for the actual anchor string; appended to
    * @param delim either ")" or "]" to generalize the routine
    * @return the name-and-year string including its ending token, or null
    *         if the tokens do not comprise a name and year (including
    *         running out of tokens)
    */

   private String isNameAndYear 
   (StringTokenizer st, Vector tokens, String delim ) {

      if (DEBUG) System.err.println(ME+"in isNameAndYear");

      StringBuffer sb = new StringBuffer();  
      String token;

      boolean findElement = true;    // phase 1: scanning up to the year
      boolean findYear = false;      // phase 2: looking for the terminator
      boolean nullYears = delim.equals("]");  // year optional inside [...]

      while ( st.hasMoreTokens() ) {

         try {

            // another name, blank, ";", ",", year, or the delimiter
            token = st.nextToken();
            tokens.addElement ( token );

            if ( findElement ) {
               if (DEBUG) {
                  System.err.println(ME+"isNameAndYear, findElement true, token = "
                  + "->"+token+"<-");
               }
               // Keep appending tokens until we get to a year or the
               // delimiter.  Each newly read token is recorded as soon
               // as it is read, so "tokens" mirrors what was consumed.
               while ( st.hasMoreTokens() && !isYear(token)
                    && !token.equals(delim) ) {
                  sb.append(token);
                  token = st.nextToken();
                  tokens.addElement(token);
               }
               if (DEBUG) System.err.println(
               "isNameAndYear scanned to end of tokens. sb: ->"+sb.toString()
               +"<- and last token is about to be appended");

               /* Either we are out of tokens or we have a year or we
                  ran into the delimiter. */
               sb.append(token);  // year or "]" or ")"
               if ( token.equals(delim) ) {
                  if ( nullYears ) return sb.toString();
                  else return null;
               }
               if ( !st.hasMoreTokens() ) break;
               findElement = false; findYear = true;
               if (DEBUG) System.err.println(ME+"isNameAndYear has <names>" 
                  + ".  Buffer is: " + sb.toString() );
            } // findElement

            // current token follows a < year >
            // find <year> terminator "," ";" or "delim".  Skip whitespace.
            else if ( findYear ) {
               sb.append(token);  
               if ( token.equals(delim) || token.equals(",") 
                    || token.equals(";") ) {
                  if (DEBUG) System.err.println(ME+"nameAndYear has year" 
                     + ", buffer is " + sb.toString() );
                  return sb.toString();
               } else if ( !token.equals(" ") ) return null;
            }

         } catch (NoSuchElementException e) { break; } // no more tokens 

      }  // end while st has more tokens
      if (DEBUG) System.err.println(ME+"isNameAndYear ran out of tokens");
      return null;

   }

   // isYear -
   /** Decides whether a token looks like a year: exactly 2 or exactly 4
    * digits, optionally followed by a single suffix letter 'a', 'b' or
    * 'c' (as in "1998a").
    * The digit count is enforced whether or not the suffix is present,
    * so tokens such as "a", "1b" or "12345c" are not years.
    * @param token the token to examine (must not be null)
    * @return true if the token has the form described above
    */
   protected boolean isYear (String token) {
      if (DEBUG) System.err.println(ME+" in isYear("+token+")");

      // number of leading characters that must be digits
      int digits = token.length();
      if ( digits > 0 ) {
         char last = token.charAt(digits-1);
         if ( last == 'a' || last == 'b' || last == 'c' ) digits--;
      }
      if ( digits != 2 && digits != 4 ) return false;

      for ( int i=0; i<digits; i++ ) {
         if ( !Character.isDigit(token.charAt(i)) ) return false;
      }
      return true;
   }

   /**
    * Tests whether a token consists of digits only.
    * @param token the token to examine (must not be null)
    * @return false as soon as a non-digit character is found, true
    *         otherwise (so the empty string yields true)
    */
   private boolean isNumeric ( String token ) {
      int pos = token.length();
      // walk from the back; any non-digit settles the answer
      while ( --pos >= 0 ) {
         char ch = token.charAt(pos);
         if ( !Character.isDigit(ch) ) {
            return false;
         }
      }
      return true;
   }

   // squareAcronym -
   /** Handles acronym-style references in square brackets.  Per the
    * original notes the intent is to recognize things like [POW99] or
    * [POS8,POS9] or [Atk00] or [MBC+89] or [RDF] or [W3C] or [CS98a,CS98b].
    * The work is delegated to enclosedAcronym, called with "[]" as the
    * delimiters, no year required, and SQUARE_BRACKETS_AROUND_ACRONYMS
    * as the hint that could be set.
    * @return whatever enclosedAcronym returns
    */
   private Vector squareAcronym( ) {
      final String caller = "squareAcronym";     // name of calling routine
      final String delimiters = "[]";            // delimiters
      final boolean yearRequired = false;        // do not need a year
      return enclosedAcronym ( caller, delimiters, yearRequired,
         SQUARE_BRACKETS_AROUND_ACRONYMS );      // hint that could be set
   }

   // cleanup -
   /** Normalizes a single reference string.  Surrounding whitespace is
   * trimmed, then leading ';' ',' '[' '{' '(' and trailing
   * ';' ',' ']' '}' ')' characters are stripped, then leading lower-case
   * words are removed (see removeLowerCaseWords), and the result is
   * wrapped in square brackets.
   * Input that is empty or consists only of the stripped characters
   * yields "[]" (such input used to raise
   * StringIndexOutOfBoundsException).
   * @param s string to be cleaned up (must not be null)
   * @return the cleaned up string, enclosed in "[" and "]"
   */
   protected static String cleanup ( String s ) {
      if (DEBUG) System.err.print(ME+"cleanup of " + s);
      s = s.trim();

      // j: index of first character that is not leading punctuation
      int j = 0;
      while ( j < s.length() && ";,[{(".indexOf(s.charAt(j)) != -1 ) j++;

      // k: index of last character that is not trailing punctuation;
      // never allowed to move left of j-1, so substring(j,k+1) is valid
      int k = s.length()-1;
      while ( k >= j && ";,]})".indexOf(s.charAt(k)) != -1 ) k--;

      String core = s.substring(j,k+1);
      // removeLowerCaseWords is only handed non-empty text
      String t = ( core.length() == 0 ) ? "" : removeLowerCaseWords ( core );
      if (DEBUG) System.err.println(" to [" + t + "]" );
      return "[" + t + "]";   
   }             // cleanup

   //   removeLowerCaseWords - 
   /** Sometimes references have lots of leading words that don't
    *  really belong to the anchor.  Strip off these words.
    *  Starting at the front, each word that begins with a lower-case
    *  character is dropped, together with the non-letters that follow
    *  it, until a word is reached that does not begin with a lower-case
    *  character.  A lower-case word for which
    *  Utils.lowerCaseNameComponent returns true stops the stripping and
    *  is kept (presumably name particles such as "van" or "de" - confirm
    *  against Utils).
    *  If the text runs out while stripping, the empty string is returned
    *  (this situation, like empty input, used to raise
    *  StringIndexOutOfBoundsException).
    *  @param t proposed reference anchor (must not be null)
    *  @return the reference anchor stripped of leading words
    */
   private static String removeLowerCaseWords ( String t ) {
      int len = t.length();
      int i = 0;     // start of the postfix of t that will be returned

      while ( i < len && Character.isLowerCase(t.charAt(i)) ) { 
         int iBeg = i;

         // advance i to the first non-letter after this word
         i++;
         while ( i < len && Character.isLetter(t.charAt(i)) ) i++;

         if ( Utils.lowerCaseNameComponent(t.substring(iBeg,i)) )
            return t.substring(iBeg);

         // step over the separator(s) to the start of the next word
         i++;
         while ( i < len && !Character.isLetter(t.charAt(i)) ) i++;
      }

      // i may have run past the end when everything was stripped
      return ( i < len ) ? t.substring(i) : "";
   }         // removeLowerCaseWords

   // expand - 
   /** helper function to turn a string like "[1,2,3]" into
   * "[1][2][3]" and [4-6] into [4][5][6]
   * If there are no commas (and no numeric ranges) in "reference" the
   * whole string is returned.
   * A dash is only treated as a range when it has digits directly on
   * both sides; any other dash (e.g. in "[MBC-89]") is left untouched.
   * Guaranteed: this is a SQUARE_BRACKET type of reference string
   * Note: As a side effect, the anchors Vector is appended to, once for
   * each expanded reference (it is created first if it is still null).
   * All trace output is produced only when DEBUG is set.
   *@param reference the reference, like "[4-6]"
   *@param anchor the anchor, like "4-6"; stored once per expanded reference
   *@return a String like "[4][5][6]"  */

   private String expand ( String reference, String anchor ) {
      if (DEBUG) {
         System.err.println(ME+"in expand, with anchor = " + anchor);
         System.err.print(ME+"expanded refstring " + reference);
      }

      // Pass 1: rewrite every numeric range "low-high" as "low,...,high"
      int i;
      int from = 0;       // where to resume looking for a dash
      while ( (i=reference.indexOf("-", from)) != -1 ) {
         int m = i;       // becomes index of first digit of "low"
         while ( m > 0 && Character.isDigit(reference.charAt(m-1)) ) m--;
         int n = i+1;     // becomes index just past last digit of "high"
         while ( n < reference.length() 
              && Character.isDigit(reference.charAt(n)) ) n++;

         if ( m == i || n == i+1 ) {   // no number on one side: not a range
            from = i+1;
            continue;
         }

         int low = Integer.valueOf( reference.substring(m,i) ).intValue();
         int high = Integer.valueOf( reference.substring(i+1,n) ).intValue();
         if (DEBUG) System.err.print(" with range " + low + " to " + high);

         // text up to the dash already ends with "low"; add the numbers
         // strictly between, then the rest (which starts with "high")
         StringBuffer expanded = new StringBuffer( reference.substring(0,i) );
         for ( int k = low+1; k<high; k++ ) expanded.append(",").append(k);
         expanded.append(",").append( reference.substring(i+1) );
         reference = expanded.toString();
         if (DEBUG) System.err.print(" to " + reference);
      }

      // Pass 2: turn each comma into "][", one anchor per reference
      if ( anchors == null ) anchors = new Vector();
      anchors.addElement ( anchor );
      StringBuffer result = new StringBuffer();
      while ( (i=reference.indexOf(",")) != -1 ) {
         result.append( reference.substring(0,i) ).append("]");
         anchors.addElement ( anchor );
         reference = "[" + reference.substring(i+1).trim();
      }

      if (DEBUG) System.err.println(" into " + result + reference
          + " current number of anchors for this context: "
          + anchors.size() );
      return result+reference;
   } // expand

   /** lastWord -
     * Given a string s, find the place where the last word in that
     * string begins.  Trailing whitespace is ignored.
     * Apostrophe, comma and dash are stepped over and the search
     * continues in the text to their left.
     * Trace output is produced only when DEBUG is set.
     * @param s the text to examine; may be null
     * @return an index into s, or -1 when s is null, empty or all
     *         whitespace, or when no acceptable word is found (for
     *         instance when the text ends in a digit)
     */
   private int lastWord ( String s ) {
      if ( s == null ) return -1;

      // trim off trailing blanks; i ends on the last non-blank character
      int i = s.length() - 1;
      while ( i >= 0 && Character.isWhitespace(s.charAt(i)) ) i--;
      if ( i < 0 ) return -1;           // empty or nothing but blanks
      s = s.substring(0,i+1); 

      // move i from last non-blank left to first blank.
      // On exit c is the first non-letter met (then i is one position to
      // its left), or the letter at index 0 if none was met (then i is -1).
      char c = s.charAt (i);
      while ( i>=0 && Character.isLetter(c) ) c = s.charAt ( i-- );
      if (DEBUG) 
         System.err.println(ME+"lastWord, after finding first blank, i="+i);
      if ( i < 0 ) {      // whole string is the name
            // case 1: c is not a letter, i = -1 (letter at 0 is blank)
            if ( Character.isWhitespace(c) ) {
               if (DEBUG) System.err.println(ME
               +"loneYear, lastWord is -a>" + s.substring(i+1)  + "<-" );
               return i+1;
            } // ran off the line without finding non-letter
            if (DEBUG) System.err.println(ME
            +"loneYear, lastWord is -b>" + s + "<-" );
            if ( Character.isLetter(c) ) return 0; else return -1;
      } else if ( i == 0 ) {  
            // NOTE(review): the original remark here read "whole string
            // is blanks plus the name"; the value returned is 1
            if (DEBUG) System.err.println(ME
            +"loneYear, lastWord is -c>" + s.substring(i+1)  + "<-" );
           return i+1;
      } else {
         if ( c == '\'' || c =='-' || c ==',' )  // keep going
            return lastWord ( s.substring(0,i) );
         else if ( i >= s.length()-2 ) return -1;  // nothing after c
         else { // c is whitespace or punctuation and there is stuff after it
            if (DEBUG) System.err.println(ME
            +"loneYear, lastWord is -d>" + s.substring(i+2) + "<-" );
            return i+2;   // c is whitespace or punctuation
         }
      }
   }         // lastWord


   // loneYear -
    /** A lone year, e.g. "(year)" has been found in the text at offset.
    * The length is 4 for "(year)" and 5 for (yeara)
    * Return null if this is not part of a reference anchor.
    * Otherwise this could be an anchor being used as a part of speech.
    * Return the anchor in the form "[name-list, year]"
    * Side effect: if non-null returned, anchors has been updated
    * with the literal text, e.g. "name-list (year)"
    *
    * The routine works backwards from the year: the word just before it
    * must be a name (not lower case) or the phrase "and others"; an
    * optional stand-alone "and" before that is accepted; then preceding
    * words are prepended until a lower-case word, the word "See", or the
    * start of the usable text is reached.
    * Trace output is produced only when DEBUG is set.
    * @param s the text containing the year
    * @param offset index in s of the character just before the year
    *        (the opening parenthesis in the "(year)" example)
    * @param length number of characters in the year (4 or 5)
    * @return "[name-list, year]" or null
    */

   private String loneYear ( String s , int offset, int length ) {

      // offset is updated to point at the previous token found that looked
      // like part of the anchor.  Initially buffer is "[year]"

      if ( offset < 1 ) return null;   // nothing before "(year)"
      int origOffset = offset;
      StringBuffer sb = new StringBuffer
         ( "["+s.substring(offset+1,offset+length+1)+"]" );
      s = s.replace('\n',' ');
      if ( DEBUG ) {
         System.err.println(ME+"loneYear working with ->"
            + s.substring(0,offset) + "<-" );
         System.err.println(ME+"loneYear buffer: " + sb.toString());
      }

      // k points to last word before the offset (-1 if there are no words). 
      int k = lastWord ( s.substring(0,offset) ); 
      if ( k == -1 ) return null;
      char c = s.charAt(k);
      // startsWith(prefix, from) simply answers false near the end of s
      if ( k>=4 && s.startsWith("and others", k-4) ) {
         sb.insert(1,"and others, ");
         offset = k-4; k=k-4;
         if ( DEBUG ) 
         System.err.println(ME+"loneYear buffer: " + sb.toString());
      } else { // check that lastWord is a name
         if ( Character.isLowerCase(c) ) return null;  // not an anchor

         // insert the name in the buffer, giving "[Name, year]"
         String name = s.substring(k,offset).trim();
         if ( name.endsWith("'s"))name=name.substring(0,name.length()-2);
         sb.insert(1, name + ", ");
         offset = k;
         if ( DEBUG ) System.err.println(ME+"loneYear buffer: " + sb.toString());
         if ( k == 0 ) {
            if ( anchors == null ) anchors = new Vector();
            anchors.addElement ( s.substring (k, origOffset+length+1) );
            return  sb.toString() ;
         }
      }

      // back up over this name and look for "and".
      // k can only be 0 here when "and others" opens the text; there is
      // then nothing to back up over.
      if ( k > 0 ) {
         c = s.charAt ( --k );
         while ( k>0 && Character.isWhitespace(c) ) c = s.charAt ( --k );
         if ( DEBUG )
         System.err.println(ME+"loneYear ->" + s.substring(0,k+1) + "<-" );
         // (A second test for "and others" used to sit here; it compared
         // five characters against a ten character literal and so could
         // never succeed.  It has been dropped.)
         // "and" must be a word of its own, not the tail of e.g. "Roland"
         if ( k>=2 && s.startsWith("and", k-2)
              && ( k == 2 || !Character.isLetter(s.charAt(k-3)) ) ) {
            sb.insert(1,"and ");
            offset = k-2;
            if ( DEBUG ) 
            System.err.println(ME+"loneYear buffer: " + sb.toString());
         }  
      }

      // get preceding word, set buffer to "[Word Word, year]"
      while ( true ) {
         k = lastWord ( s.substring(0,offset) );
         // k >= offset means lastWord made no progress; stop rather
         // than loop forever
         if ( k == -1 || k >= offset ) {
            if ( anchors == null ) anchors = new Vector();
            anchors.addElement ( s.substring(0, origOffset+length+1));
            return sb.toString(); // return "[Name, year]"
         }
         c = s.charAt(k);
         if ( Character.isLowerCase(c) ) {
            // the lower-case word is not part of the anchor: the literal
            // anchor starts at the earliest word accepted so far
            if ( anchors == null ) anchors = new Vector();
            anchors.addElement ( s.substring(offset,origOffset+length+1) );
            if ( DEBUG ) 
            System.err.println(ME+"loneYear returning " + sb.toString());
            return sb.toString();
         }
         String word = s.substring(k,offset).trim();
         if ( word.equals("See") ) {
            // "See" stays in the literal anchor but not in the reference
            if ( anchors == null ) anchors = new Vector();
            anchors.addElement ( s.substring(k, origOffset+length+1) );
            return sb.toString();
         }
         sb.insert( 1, word + " ");
         if ( DEBUG ) 
            System.err.println(ME+"loneYear buffer: " + sb.toString());
         offset = k;
      } // while true
   }  // loneYear


   // endRef -
   /** Closes off the current reference being built in a list-of-anchors.
   * The buffer "sb" gets a closing "]", is passed through cleanup() and
   * the outcome is appended to "result"; the text held in "anchorBuf" is
   * appended to the instance's "anchors".  Afterwards sb holds just "["
   * and anchorBuf is empty, ready for the next reference.
   * When "result" is null a new result Vector is started and "anchors"
   * is replaced by a new, empty Vector as well.
   * Note that this routine should be called when we have
   * "...year," or ";" or list-of-anchors with no " and "
   * @param result Vector of canonicalized references in this context,
   *        or null if none has been collected yet
   * @param anchorBuf StringBuffer holding the literal anchor(s)
   * @param sb StringBuffer holding the canonicalized anchors
   * @return the (possibly newly created) Vector of canonicalized
   *         references, now one element longer
   */

   private Vector endRef 
   ( Vector result, StringBuffer anchorBuf, StringBuffer sb ) {
      sb.append ("]");
      if (DEBUG) {
         System.err.println ( ME + "in endRef with buffer ->"+sb.toString()
            +"<-");
      }

      // first reference of a list: start both collections afresh
      if ( result == null ) {
         anchors = new Vector();
         result = new Vector();
      }

      String canonical = cleanup(sb.toString());
      result.addElement ( canonical );
      String literal = anchorBuf.toString();
      anchors.addElement ( literal );

      // reset the work buffers for the next reference
      sb.setLength(0);
      sb.append("[");
      anchorBuf.setLength(0);
      return result;
   }

   // mergeContexts -
   /** Folds this sentence into the list of contexts: when the list
    *  already has entries, this sentence is appended to the last one;
    *  when the list is empty, this sentence becomes its first entry.
    *  @param contextTrees the Vector of SentenceTree contexts; modified
    */
   private void mergeContexts (Vector contextTrees) {

      if (DEBUG) System.err.print(ME+ "mergeContexts...");

      if ( contextTrees.size() != 0 ) {
         SentenceTree lastContext = (SentenceTree) contextTrees.lastElement();
         lastContext.append(this);
         if (DEBUG) System.err.print("appended");
      } else {
         contextTrees.addElement ( this );
         if (DEBUG) System.err.print("added");
      }

      if (DEBUG) {
         System.err.println(" sentence tree to previous context ");
         System.err.println(ME+getLinks());
      }
   }

   // instance variables (for this sentence) needed for updateContextTrees

   // Result of endsWith(FALSESTOPS) for this sentence, assigned in
   // updateContextTrees.  -1 means "none"; any other value is used as an
   // index into FALSESTOPS and FALSESTARTS.
   protected int whichFALSESTOP = -1;

   // Result of findReferences() for this sentence, assigned in
   // updateContextTrees.
   protected boolean hasReferences = false;  // does this sentence contain refs

   // updateContextTrees -
   /**
    * Decides what becomes of the current sentence with respect to the
    * list of contexts: append it to the previous context, add it as a
    * new context, or drop it.
    *
    * The decision uses three facts: whether this sentence contains
    * references (findReferences), whether it ends with a potential false
    * stop (endsWith(FALSESTOPS)), and whether it begins with the false
    * start matching the false stop of the last context in the list.
    *
    * Step 1: if the last context has no references (it was kept only
    * because of its potential false stop) and this sentence does not
    * continue it, that context is removed from the list.
    *
    * Step 2:
    *   - this sentence continues the last context: merge (append);
    *   - otherwise, it has references or ends with a potential false
    *     stop: add it as a new context;
    *   - otherwise: drop it.
    *
    * Append or Add - caller should start a new SentenceTree (return true)
    * Drop - caller may reset and reuse this SentenceTree (return false)
    *
    * As a side effect the argument vector can come back one element
    * shorter, the same size, or one element longer.  It may end up
    * holding contexts without references; per the original notes,
    * ReferenceSection has been repaired to not gag on those.
    *
    * @param contextTrees Vector of SentenceTree contexts; modified
    * @return true if this sentence was merged or added, false if dropped
    */
   protected boolean updateContextTrees (Vector contextTrees) {

      boolean result;

      // Pick up some values to work with
      hasReferences = findReferences();
      whichFALSESTOP = endsWith(FALSESTOPS);

      boolean havePrevious = contextTrees.size() > 0;
      SentenceTree prevContext = havePrevious
         ? (SentenceTree) contextTrees.lastElement() : null;
      int prevFalseStop = havePrevious ? prevContext.whichFALSESTOP : -1;

      if ( DEBUG ) {
         System.err.print (ME +
            "updateContextTrees... whichFALSESTOP, prevContext: " 
            + prevFalseStop+", " );
         System.err.println( prevContext==null
            ? "is null" : ""+prevContext.hasReferences );
      }

      // Step 1: a reference-less previous context was saved only because
      // it ended with a potential stop; discard it unless this sentence
      // picks up where it left off.
      if ( prevContext != null && !prevContext.hasReferences
           && ( prevFalseStop == -1 
                || !beginsWith( FALSESTARTS[prevFalseStop] ) ) ) {
         contextTrees.remove(contextTrees.size()-1);
         if (DEBUG) System.err.println(ME+"updateContextTrees "
            + "removing bogus context.  contextTrees has "
            + contextTrees.size() + " elements");
      }

      // Step 2: determine what to do with this sentence
      if ( hasReferences ) {

         if ( prevFalseStop != -1 
              && beginsWith ( FALSESTARTS[prevFalseStop] ) ) {
            // References here, and the last sentence had a false stop
            // which this sentence continues: concatenate.
            mergeContexts(contextTrees);
         } else {
            // References here, but nothing to continue: new context.
            contextTrees.addElement ( this );
            if (DEBUG) {
               System.err.println(ME+ "added sentence tree to contextTrees");
               System.err.println(ME+getLinks());
            }
         }
         result = true;    // caller should leave this Sentence Tree be

      } else if ( prevFalseStop != -1 
                  && beginsWith ( FALSESTARTS[prevFalseStop] ) ) {

         // No references here, but this sentence continues the last
         // one past its false stop: concatenate.
         if (DEBUG) { 
            System.err.println(ME+"updateContexts..." +
               "beginsWith returned true. Merging sentence with " +
               "previous context"); 
            System.err.println(ME+getLinks());
         }
         mergeContexts(contextTrees);
         result = true;    // leave this Sentence Tree be

      } else if ( whichFALSESTOP != -1 ) {

         // No references and nothing continued, but this sentence ends
         // with a potentially false stop.  Keep it as a context in case
         // the next sentence contains a reference.
         contextTrees.addElement(this);
         if (DEBUG) {
            System.err.println(ME+ "added sentence tree to "
               +" list of context trees. ");
            System.err.println(ME+getLinks());
         }
         // When the previous context had a false stop this path leaves
         // right away, without the closing trace below.
         if ( prevFalseStop != -1 ) return true;
         result = true;

      } else {

         // No references, nothing continued, no potentially false
         // stop.  So, drop it.
         result = false;
      }

      if ( DEBUG ) System.err.println (ME +
         "returning " + result + " from updateContextTrees... whichFALSESTOP: "
            + whichFALSESTOP +
            (whichFALSESTOP==-1?"":" ("+FALSESTOPS[whichFALSESTOP]+")"));
     
      return result;

   }   // ends updateContextTrees

   // =================  Internal Classes ===========================

   // TBD This class is no longer needed, but then addNode has to be
   // rewritten.  addNode is called by XHTMLAnalyzer.  And it is 
   // useful for concatenating trees.
   /**
    * One node of the sentence tree.  A node carries a string value and
    * a type, and is linked to its parent, its first son and its next
    * sibling.  The type is compared against TEXT, TAG, HREF and ENDTAG,
    * which are declared outside this class.
    */
   protected class Node {
     
      protected String value;            // text, tag name or href value
      protected int type;                // TEXT, TAG, HREF or ENDTAG
      protected Node parent,son,next;    // tree links; null when absent
     
      /**
       * @param v the node's value
       * @param t the node's type
       * @param p parent node, or null
       * @param s first son, or null
       * @param n next sibling, or null
       */
      public Node(String v, int t, Node p, Node s, Node n) {
         parent = p; son = s; next = n; value = v;
         type = t;
      }

      /**
       * Recursive dump in preorder (root first): this node, then its
       * sons, then its following siblings.  Text is emitted as is, a tag
       * as "<value ", an href as href="value"> and an end tag as
       * "</value>".
       * @return the markup for this node and everything reachable from it
       */
      protected String dump() {
         String result="";
         String quote = "\"";
         if (type == TEXT) result = value;
         if (type == TAG) result = "<"+value+" ";
         if (type == HREF) result = "href="+quote+value+quote+">";
         if (type == ENDTAG) result = "</"+value+">";
         if (son != null) result=result+son.dump();
         if (next != null) result=result+next.dump();
         return result;
      }

      /**
       * Recursive dump in preorder of only the text of the tree.
       * @return the values of all TEXT nodes reachable from this node,
       *         concatenated in preorder
       */
      protected String text() {
         String result="";
         if (type == TEXT) result = value;
         if (son != null) result=result+son.text();
         if (next != null) result=result+next.text();
         return result;
      }

      /**
       * Dump first text node only, searching this node, then its sons,
       * then its following siblings.  A text node whose value is ""
       * does not end the search.
       * (The search result used to be thrown away, so that "" came back
       * whenever this node itself was not a text node.)
       * @return the first text found, or "" if this tree contains no
       *         nodes with text in it
       */
      protected String firstText() {
         if ( type == TEXT ) return value;
         String result="";
         if ( son != null ) result = son.firstText();
         if ( result.equals("") && next != null ) 
            result = next.firstText();
         return result;
      }

   } // Node

   // Link is a structure for collecting information about a potentially
   // linkable reference.  Initialized by findReferences.
   // TBD: this class is no longer needed
   private class Link {

      /** tag as it appears in References; not set by the constructor */
      private String refTag;

      /** which context was examined */
      private int contextTreeNumber;

      /** tag as it appears in context */
      private String refInContext;

      /** tree to be replaced */
      private Node refNode;

      /** replacement tree (an XLink); not set by the constructor */
      private Node replNode;

      /**
       * Records where a reference was seen.
       * @param ctn number of the context tree that was examined
       * @param refNode the tree to be replaced
       * @param refInContext the tag as it appears in the context
       */
      private Link ( int ctn, Node refNode, String refInContext ) {
         this.refNode = refNode;
         this.refInContext = refInContext;
         this.contextTreeNumber = ctn;
      }

   }

   // internal class to split a reference combination into individual
   // ones.  That is, "[F97,  IAB]" becomes two: "[F97][IAB]"
   // The usual case is that a Split object is only a single reference.
   protected class Split {

      private Vector anchors = null;     // literal anchors, one per reference
      private Vector refs = null;        // canonical references
      private Vector tokens = null;      // the tokens
      private String ldelim = null;      // left delimiter
      private String rdelim = null;      // right delimiter

      // tokens of the reference, including the delimiters, which
      // are the first and last elements in "tokens"

      // Constructor -
      /** analyzes the reference, builds "anchors" and "refs".
      * Unless the reference is a conjunction or an "et al." form (see
      * noEtal), it is cut into several references:
      * when at least one year token is present, at every "," that
      * follows a year; when no year token is present, at every "," and
      * ";".  In the latter case the enclosing object's hint is switched
      * from BRACKETS_AROUND_COMMAED_NAMES_AND_YEARS to
      * SQUARE_BRACKETS_AROUND_ACRONYMS.
      * NOTE(review): with years present a ";" never cuts, and a year
      * stays "remembered" until the next "," - confirm this is intended.
      * The vector passed in is modified in place.
      * @param t the Vector of tokens (which are Strings), including
      * the delimiters, which are the first and last elements of
      * the vector.  If t is null nothing is built and getAnchors/getRefs
      * return null; if t is empty they return empty Vectors.
      */
      public Split ( Vector t ) {
         if ( DEBUG ) System.err.println (ME+" constructing a Split");
         if ( t == null ) return;

         tokens = t;
         anchors = new Vector();
         refs = new Vector();
         if ( tokens.size() == 0 ) return;   // no delimiters, nothing to do

         ldelim = (String)tokens.elementAt(0);
         rdelim = (String)tokens.elementAt(tokens.size()-1);
         if ( DEBUG ) System.err.println (ME+"tokens: " + toString()
            + " Delimiters are '" + ldelim + "' and '" + rdelim + "'");

         if ( noEtal(toString()) ) {
            int nYears = 0;
            boolean wasYear = false;
            int i = 0;
            while (i < tokens.size() ) {
               String token = (String)tokens.elementAt(i);
               if ( isYear( token ) ) {
                  nYears++; wasYear = true;
               } else if ( token.equals(",") && wasYear ) {
                  splitAt(i);
                  wasYear = false;
                  i++;          // step over the delimiter just inserted
               }
               i++;
            }
            if ( nYears == 0 ) {
               if ( hint == BRACKETS_AROUND_COMMAED_NAMES_AND_YEARS )
                  hint = SQUARE_BRACKETS_AROUND_ACRONYMS;
               i=0;
               while ( i<tokens.size() ) {
                  String token = (String)tokens.elementAt(i);
                  if ( token.equals(",") || token.equals(";") ) {
                     splitAt(i);
                     i++;       // step over the delimiter just inserted
                  }
                  i++;
               }
            }
            // TBD (nYears > 1): modify vector by inserting delimiters
            // if necessary
         } // if noEtal
         cleanup();  // copy modified tokens vector to destination
      }                   // split

      /** Replaces the separator token at index i by the pair
      * rdelim, ldelim, closing one reference and opening the next. */
      private void splitAt ( int i ) {
         tokens.setElementAt(rdelim,i);
         tokens.insertElementAt(ldelim,i+1);
         if ( DEBUG )
            System.err.println (ME+"-Split, i="+i+": " + toString());
      }

      /** @return the literal anchors, parallel to getRefs(); null if
      * this Split was built from a null vector */
      public Vector getAnchors () { return anchors; }

      /** @return the canonical references; null if this Split was
      * built from a null vector */
      public Vector getRefs() { return refs; }

      // copy the revised Vector of tokens into the class vectors.
      // Each run of tokens that ends with rdelim becomes one entry in
      // "refs" (delimiters included) and one entry in "anchors"
      // (delimiters left out).  Tokens after the last rdelim are dropped.
      private void cleanup ( ) {
         String token;
         Enumeration e = tokens.elements();
         StringBuffer rString = new StringBuffer ((String)e.nextElement());
         StringBuffer aString = new StringBuffer();
         while ( e.hasMoreElements() ) {
            token = (String)e.nextElement();
            rString.append(token);
            if ( token.equals(rdelim) ) {
               refs.addElement(rString.toString());
               anchors.addElement(aString.toString());
               if ( e.hasMoreElements() ) {
                  rString = new StringBuffer();
                  aString = new StringBuffer();
               }
            } else if ( !token.equals(ldelim) ) aString.append(token);
         }
      }    // cleanup

      /** @return all tokens concatenated in order; "" when there are
      * no tokens (including a Split built from a null vector) */
      public String toString() {
         StringBuffer sb = new StringBuffer();
         if ( tokens == null ) return sb.toString();
         Enumeration e = tokens.elements();
         while ( e.hasMoreElements() )
            sb.append( (String)e.nextElement());
         return sb.toString();
      }

      // noEtal -
      /** determines whether this reference is free of conjunctions and
      * "et al." forms
      * @param ref the String that is the reference,
      *        e.g. "[Booze and Allen, 1999]"
      * @return true if the string contains none of " and ", "&",
      *         "et al." and "et. al"; false if it contains any of them
      */
      private boolean noEtal ( String ref ) {
         // "et. al." needs no test of its own: it contains "et. al"
         return (ref.indexOf(" and ") == -1) &&
                (ref.indexOf("&") == -1)     &&
                (ref.indexOf("et al.") == -1)     &&
                (ref.indexOf("et. al") == -1);
      }

   }  // ends class Split

} // SentenceTree

