// bergmark - December 2000 - The reference linking project

package Linkable.Analysis;

// Context Section

/** The ContextSection class is responsible for handling start tags,
 *  end tags, and text while scanning the text of the paper.
 *  One instance of this class is constructed at the beginning of analysis.
 *  The main information encapsulated by it is the format of Section
 *  Headers for this document.  
 */

// Update History:
// 2001-06-06:  Generalized algorithm for parsing SH start sequences
// 2001-06-21:  Implement the addition of extraneous tags within a subsection
//              header sequence in checkForHeader()
// TBD:
// Make this less HTML specific by feeding in possible sequences
// at constructor time.

// for AttributeList
import org.xml.sax.*;          // needs xml-tr2/xml.jar in $CLASSPATH

import Linkable.Utility.CONFIG;

import java.util.Vector;

/**
 * Recognizes section-header tag sequences while the markup of a paper is
 * scanned.
 *
 * <p>The class works in two phases.  Until a header sequence has been
 * discovered, {@link #handleStartTag} matches incoming start tags against the
 * table of candidate sequences built in the constructor and records the tags
 * seen so far in <code>sectionName</code>.  Once one candidate has been matched
 * completely (<code>seqComplete</code>), later tags are compared against the
 * recorded sequence instead.
 *
 * <p><code>sectionName</code> is laid out as <code>tag , tag , tag ...</code>:
 * tag names sit at even positions and the separator string "," at odd
 * positions.
 */
public class ContextSection {

   private static final String ME = "ContextSection: ";
   private static final boolean DEBUG = CONFIG.DEBUG;

   /** Separator stored in sectionName between two consecutive tags. */
   private static final String SEPARATOR = ",";

   private Vector sectionName = new Vector(); // <"h3"> or <"p",",","b"> or...
   private boolean seqComplete = false;       // true once a full sequence was found
   private boolean isSectionHeader = false;   // next text is a header
   private Vector[] possibleSHSeqs = new Vector[5]; // 5-element array of Vecs
   private int[] min_max = {0,4}; // current candidate range in possibleSHSeqs
   private int index = 0;         // next position in sectionName to match
   private int tagsMatched = 0;   // number of tags matching possible seq so far

   /**
    * Builds the table of candidate section-header start sequences.
    * Each entry of <code>possibleSHSeqs</code> is a Vector of tag specs, and
    * each tag spec is itself a Vector holding either just the tag name or
    * the tag name followed by an attribute name and the required value.
    */
   public ContextSection() {
      super();
      if ( DEBUG ) System.err.println(ME+"in constructor...");
      // initialize the list of possible Section Header start sequences
      // Each element of the array is a vector of Strings.
      // It is an ERROR for one string sequence to contain another
      String[] v1 = {"h3"};
      String[] v2 = {"h4"};
      String[] v3 = {"h5"};
      String[] v4 = {"font","size","+1"}; String[] v5 = {"strong"};
      String[] v6 = {"p"}; String[] v7 = {"b"};
      for ( int i=0; i< possibleSHSeqs.length; i++ ) {
         possibleSHSeqs[i] = new Vector();
      }
      possibleSHSeqs[0].addElement(getVector(v1));
      possibleSHSeqs[1].addElement(getVector(v2));
      possibleSHSeqs[2].addElement(getVector(v3));
      possibleSHSeqs[3].addElement(getVector(v4));
      possibleSHSeqs[3].addElement(getVector(v5));
      possibleSHSeqs[4].addElement(getVector(v6));
      possibleSHSeqs[4].addElement(getVector(v7));
      if ( DEBUG ) {
         System.err.println(ME+"leaving constructor...\n"
         + "Here are the section header start sequences:");
         for ( int i=0; i<possibleSHSeqs.length; i++ ) {
            System.err.println(possibleSHSeqs[i].toString());
         }
      }
   }

   /** Returns a one-line dump of the matcher state, prefixed with the class tag. */
   public String toString() {
      String name = "empty";
      if ( sectionName.size() != 0 ) name = sectionName.toString();
      return ME+"section name: " + name
      + ", isSectionHeader: " + isSectionHeader
      + ", tagsMatched: " + tagsMatched
      + ", index: " + index
      + ", seqComplete: " + seqComplete ;
   }

   // ================   protected routines ==========================

   // handleStartTag -
   /**
    * Decides whether a start tag begins (or continues) a section header.
    *
    * <p>Per the original author's notes this is called while authors are
    * being scanned, outside of tables, after at least one author was found;
    * that caller contract is not visible in this file.
    *
    * <p>While no sequence is known yet, the tag is matched against the
    * candidate table; a complete match sets <code>seqComplete</code> and
    * <code>isSectionHeader</code>, and a tag that fits no candidate discards
    * the partial sequence.  Once a sequence is known, the tag is compared
    * with the recorded sequence instead.
    *
    * @param name  name of the start tag; a null name is reported on stderr
    *              and treated as "not a header"
    * @param attrs attributes of the start tag, may be null
    * @return true if this tag completes a section header sequence
    */
   protected boolean handleStartTag ( String name, AttributeList attrs ) {
      if ( DEBUG ) System.err.println(ME+"in handleStartTag <"+name+">");
      if ( name == null ) {
         System.err.println(ME+"complaining loudly, because arg to "
         + "handleStartTag is null!");
         return false;
      }
      if ( DEBUG ) System.err.println(toString());

      if ( seqComplete ) {
         // We know what the SH tags are supposed to be.
         return matchKnownSequence(name);
      }

      // We do not yet know what SH tags are used in this item
      if ( matchesAtLeastOne (name,attrs)) {
         for (int i=min_max[0]; i<=min_max[1]; i++) {
            // n tags occupy 2n-1 slots of sectionName because of the
            // separators; if all slots are filled we are done
            if ( sectionName.size() == 2*possibleSHSeqs[i].size()-1 ) {
               seqComplete = true;
               isSectionHeader = true;
            }
         }
         // otherwise we are on track, but not done yet
      } else {
         abortSeqBuilding();
      }
      if (DEBUG) System.err.println(ME+toString());
      return isSectionHeader;
   }

   /**
    * Compares a start tag with the already discovered header sequence.
    *
    * NOTE(review): on a match the index stops ON the next separator rather
    * than past it, and the index also advances when the tag does not match.
    * checkForHeader() moves past the separator and resets on mismatch.  The
    * original behavior is kept here because the callers are not visible;
    * confirm which convention is intended for multi-tag sequences.
    *
    * @param name non-null start tag name
    * @return true for an extraneous tag after the full sequence, or when
    *         this tag brings the index to the end of the sequence
    */
   private boolean matchKnownSequence ( String name ) {
      if ( index >= sectionName.size() ) {
         if ( DEBUG )
         System.err.println(ME+"found extraneous tag");
         return true;
      }
      Object expected = sectionName.elementAt(index++);
      if ( !name.equals(expected) ) {
         return false; // failed to match the known sequence
      }
      while ( index < sectionName.size() &&
              !SEPARATOR.equals(sectionName.elementAt(index)) ) {
         index++;
      }
      boolean atEnd = (index == sectionName.size());
      if ( DEBUG )
      System.err.println(ME+" at end of sequence: " + atEnd);
      isSectionHeader = atEnd;
      return atEnd;
   }

   // handleEndTag -
   /**
    * Processes an end tag: text after it is no longer header text, a
    * partially discovered sequence is discarded, and for a known sequence
    * the index is stepped back when the tag closes the last matched element.
    *
    * @param name of the end tag; null never matches
    */
   protected void handleEndTag ( String name ) {
      if ( DEBUG )
      System.err.println(ME+"in handleEndTag <"+name+">");
      isSectionHeader = false;
      if ( index <= 0 ) return;
      if ( !seqComplete ) {
         abortSeqBuilding(); // in case we were discovering a section header
         return;
      }
      // assert that name is the same as the last element in sectionName
      Object last = sectionName.elementAt(index-1);
      if ( DEBUG ) {
         System.err.println(ME+"in handleEndTag with tag <"+name+"> "
         + ", top element in stack is: <"
         + (String)last + ">"
         + ", and index = " + index);
      }
      // comparison is written receiver-first so a null name cannot throw
      if ( last.equals(name) ) {
         // step back over the tag and, when there is one, its separator
         if ( index > 1 ) {
            index -= 2;
         } else {
            index--;
         }
      }
   }

   //                     checkForHeader -
   /**
    * Start-tag handler used once the header sequence is known (per the
    * original notes, only while contexts are being collected).
    * <code>index</code> is the next position in sectionName to be matched.
    * A matching tag advances the index past the following separator; a tag
    * seen after the whole sequence matched is ignored as extraneous; any
    * other tag resets the match.
    *
    * @param name  name of the start tag; null never matches
    * @param attrs attributes of the start tag (not inspected here)
    */
   protected void checkForHeader ( String name, AttributeList attrs ) {
      if ( index < sectionName.size() &&
           sectionName.elementAt(index).equals(name) ) {
         index ++;
         // move index off the vector or beyond the next ","
         while ( index < sectionName.size() &&
                 !SEPARATOR.equals(sectionName.elementAt(index++)) ) {
            // scanning only
         }
         isSectionHeader = (index >= sectionName.size());
      } else if ( index >= sectionName.size() ) {
         // ignore extraneous tags
         isSectionHeader = true;
      } else {
         isSectionHeader = false;
         index = 0;
      }
      if ( DEBUG ) System.err.println(ME+" in checkForHeader <" + name
      + "> isSectionHeader set " + isSectionHeader);
   }

   // handleText -
   /** Called whenever text is encountered.
    *  Any text aborts a possible section header that was only partially
    *  scanned.  */
   protected void handleText() {
      if ( index < sectionName.size() ) index = 0;
   }

   // isReference -
   /**
    * Determines whether the given header text starts the reference section.
    *
    * @param text text found inside the current element
    * @return true only if a section header is currently open and
    *         ReferenceSection recognizes the text as a reference heading
    */
   protected boolean isReference ( String text ) {
      if ( isSectionHeader && ReferenceSection.isRefSectionHeader ( text )) {
         if ( DEBUG ) System.err.println ( ME + "found Reference Section");
         return true;
      }
      return false;
   }

   // Delete sequence -
   /** Intended to remove one sequence, if present, from the array of
    *  possible section-header start sequences.  Currently a no-op: the
    *  loop body is still to be written, so the argument is ignored.
    *  The misspelled name is kept because callers outside this file may
    *  already use it.
    */
    protected void deleteSequelnce ( Vector v ) {
       for (int i=0; i<possibleSHSeqs.length; i++) {
          // TBD: if v contains the same strings as possibleSHSeqs[i]
          // then set possibleSHSeqs[i] to null or a special sequence
          // Could get fancy and swap the last element with the [ith]
          // element
       }
    }

   // ================   private routines ==========================

   /** Copies an array of strings into a new Vector, preserving order. */
   private Vector getVector ( String[] arg ) {
      Vector v = new Vector ( arg.length );
      for (int i=0; i<arg.length; i++ ) v.addElement(arg[i]);
      return v;
   }


   /** Discards a partially built sequence and reopens the full candidate
    *  range.  Note that <code>index</code> is not touched. */
   private void abortSeqBuilding () {
      sectionName.clear();
      min_max[0] = 0; min_max[1] = possibleSHSeqs.length-1;
      tagsMatched = 0;
      if ( DEBUG )
      System.err.println(ME+"aborted sequence building.\n"+toString());
   }

   /**
    * Determines whether the tag extends the sequence built so far towards
    * at least one candidate in the current <code>min_max</code> range.
    * Candidates that cannot accept the tag are dropped from the range as the
    * scan proceeds, so on a true return <code>min_max[0]</code> is the index
    * of the candidate that accepted the tag.  A candidate is considered only
    * if the tags already recorded in sectionName are its own prefix;
    * otherwise tags of two different candidates could be combined into a
    * sequence that is not in the table.
    *
    * @param name  non-null tag name
    * @param attrs attributes of the tag, may be null
    * @return true if the tag was accepted (and recorded) for one candidate
    */
   private boolean matchesAtLeastOne ( String name, AttributeList attrs ) {

      if ( DEBUG )
      System.err.println(ME+"matchesAtLeastOne with " + sectionName.toString()
      + ", tag <"+name+">, min_max is "+min_max[0]+":"+min_max[1]);

      for ( int i=min_max[0]; i<=min_max[1]; i++ ) {

         Vector possible = possibleSHSeqs[i]; // a Vector of vectors
         if ( DEBUG )
         System.err.println(ME+"matching ("+i+"): " + possible.toString());
         if ( tagsMatched < possible.size() && prefixMatches(possible) ) {
            // determine whether "name" is next tag in this sequence
            if ( f((Vector)possible.elementAt(tagsMatched),name,attrs) )
               return true;
         }
         // reach here if possible sequence i did not match candidate,
         // possibly because sequence i cannot take another tag
         if ( i == min_max[0] ) min_max[0]++;
         if ( i == min_max[1] ) min_max[1]--;
         // just go to the next i
      }
      // reach here if none of the possible sequences matches
      return false;
   }

   /**
    * Checks that the tags recorded so far are the leading tags of the given
    * candidate.  The caller guarantees tagsMatched is smaller than the
    * candidate's length.
    */
   private boolean prefixMatches ( Vector possible ) {
      for ( int k=0; k<tagsMatched; k++ ) {
         int pos = 2*k; // tags sit at even positions of sectionName
         if ( pos >= sectionName.size() ) return false;
         Vector tagSpec = (Vector)possible.elementAt(k);
         if ( !tagSpec.elementAt(0).equals(sectionName.elementAt(pos)) )
            return false;
      }
      return true;
   }

   /**
    * Determines whether this tag/attributes matches one tag spec and, if so,
    * records the tag in sectionName (preceded by a separator when it is not
    * the first tag) and increments tagsMatched.
    *
    * @param v     tag spec: {tagName} or {tagName, attrName, attrValue}
    * @param name  non-null tag name
    * @param attrs attributes of the tag, may be null
    * @return true if the tag fits the spec
    */
   private boolean f (Vector v, String name, AttributeList attrs ) {
      // the tag name must match whether or not attributes are required
      if ( !name.equals(v.elementAt(0)) ) return false;
      if ( v.size() > 1 ) {
         // spec carries an attribute requirement; a spec without a value
         // is malformed and never matches
         if ( attrs == null || v.size() < 3 ) return false;
         String attrName = (String)v.elementAt(1);
         String attrValue = (String)v.elementAt(2);
         String x = attrs.getValue(attrName);
         if ( !attrValue.equals ( x ) ) return false;
      }
      // tag/attributes match given Vector
      if ( sectionName.size() > 0 ) sectionName.addElement(SEPARATOR);
      sectionName.addElement(name);
      // TBD: if attributes are part of it, add attr name and value
      tagsMatched++;
      if ( DEBUG )
      System.err.println(ME+"f returns true");
      return true;
   }

} // ends Class ContextSection

