// bergmark - September 2000 - Reference Linking Project

package Linkable.Analysis;

// AuthorSection

/** The AuthorSection class is responsible for collecting and
  * analyzing the contents of the author section of an online
  * paper.  The AuthorSection should pretty closely follow the
  * title of the paper.

  * Each author consists of first name or initials and a last
  * name, plus perhaps "von" and honorifics.  There may be 
  * a commaed list on one line, or there may be one name per
  * line.

  * It is safe to assume that in the Author Section, no names
  * are written as last name first.  

  * The author names are likely to be interspersed with rolenames
  * (e.g. professor), institutions, and email addresses.

  * All this code has been moved here from the various parsers.  Code in
  * this class should be totally independent of whether an HTML, XHTML,
  * ASCII, or Postscript document is being parsed.  This code works with
  * unmarked up hunks of text, as well as with tags represented as
  * strings.
  */

// Update History:
// 2000-09-21: Created this file from code previously in XHTMLAnalyzer
// 2000-10-23: Fixed bug that made analysis fail to parse "Adam Chandler and
//             Dan Foley" and "name1, name2 and name3"
// 2000-11-15: Fixed bug in handleAuthor that failed to parse "name1, name2"
// 2001-04-18: Exclude authors like "by"
// 2001-05-30: Add the "gotAuthors" routine
// 2001-06-03: Major additions to handle tags and more circumstances.

import Linkable.Utility.Author;
import Linkable.Utility.AuthorDatabase;
import Linkable.Utility.CONFIG;

import java.util.Vector;

import org.xml.sax.AttributeList;

/**
 *  This class handles the parsing of the author section of a document.
 *  It decides whether or not to stay in the author section and whether
 *  or not to skip text or to look for an author name.  The constructor
 *  specifies an array of tag strings that should be used to look for a
 *  potential author or author list.  SAX events (handleStartTag, 
 *  handleEndTag, and handleText) are mirrored here in the AuthorSection,
 *  which handles it specifically for the author section.  None of these
 *  routines is called unless it is known that we are parsing the author
 *  section.  After one of these routines returns false, they will not
 *  be called again for this paper.
 */
public class AuthorSection {

   private static final String ME = "AuthorSection: ";
   private static final boolean DEBUG = CONFIG.DEBUG;

   /** Authors collected so far (Author objects), in the order they were found. */
   private Vector v = new Vector();

   /** Tags that are allowed to open an author tag sequence (source-format dependent). */
   private String[] possibleStartTags;

   /** The tag sequence that was actually seen while looking for the first author. */
   private Vector startAuthorTags = new Vector();

   /** Current position in startAuthorTags; -1 means we are outside the sequence. */
   private int index = -1;

   /** Receives start/end tags that are forwarded from this section. */
   private ContextSection cs = null;

   /** Set through setTable; tags are only forwarded to the ContextSection while true. */
   private boolean notInTable = true;

   // grabAuthor is used to prevent picking up spurious author names (such
   // as institution) but also to force the parsing of text into an author
   // name when indicated.  It is used to handle multi-line author entries of
   // the form <p>author<br>institution<br>email addr</P>
   private boolean grabAuthor = true;

   /**
    * Creates an author section.
    * @param _cs the ContextSection that tags are handed to once they no
    *            longer belong to an author
    * @param st  the tag names that may start an author name; these depend
    *            on the document source format
    */
   public AuthorSection(ContextSection _cs, String[] st) {
      super();
      possibleStartTags = st;
      cs = _cs;
   }

   /**
    * Sets the notInTable switch.
    * @param sw the new value; while true, start and end tags are passed
    *           on to the ContextSection
    */
   public void setTable ( boolean sw ) { notInTable = sw; }

   // handleStartTag
   /**
    * Given a tag and its attributes, determine whether this could be
    * the start of a new author list.  While no author has been found yet,
    * this is also where the sequence of tags that starts up an author
    * list is remembered.
    * @param name  the name of the tag that has been encountered, e.g. "p"
    * @param attrs the AttributeList on that tag; only forwarded to the
    *              ContextSection
    * @return true if we should stay in the author section, false
    * if we should move on to doing the body of the document
    */
   protected boolean handleStartTag( String name, AttributeList attrs ) {
      if (DEBUG) System.err.println(ME+"scanning for authors, got a <"
         + name + ">, index is " + index);

      if ( v.size() > 0 ) {
         // We already have at least one author: follow the remembered sequence.
         if ( index >= 0 && name.equals(startAuthorTags.elementAt(index)) ) {
            // Never step past the last remembered tag.  An unbounded index
            // made the next elementAt(index) call (here or in handleEndTag)
            // throw ArrayIndexOutOfBoundsException.
            if ( index < startAuthorTags.size() - 1 ) index++;
            grabAuthor = true;
            return true;
         }

         // A <br> separates the author from institution/email lines.
         if ( name.equals("br") ) {
            grabAuthor = false;
            return true;
         }

         // Still inside the author environment: ignore unrelated tags.
         if ( index != -1 ) return true;

         // We have backed out completely; does this tag start a new author?
         if ( startAuthorTags.size() > 0
              && name.equals(startAuthorTags.elementAt(0)) ) {
            index++;
            grabAuthor = true;
            return true;
         }

         // Have finished the last author; this is something else.
         if ( DEBUG )
            System.err.println(ME+"Author start sequence tag aborted."
               + "Tag is <" + name + ">, index is " + index
               + ", sequence is " + startAuthorTags.toString() );
         return !(notInTable && cs.handleStartTag(name,attrs));
      }

      // Still looking for the first author.
      if ( index == -1 ) {
         // Only a tag from possibleStartTags may begin the sequence.
         if ( isPossibleStartTag(name) ) {
            startAuthorTags.addElement(name);
            index++;
         } else if ( DEBUG ) {
            System.out.println (
               "possible anomaly - tag <" + name
               + "> cannot start an author");
         }
         return true;
      }

      // The sequence has been started: append every further tag to it.
      startAuthorTags.addElement(name);
      index++;
      return true;
   }                 // handleStartTag

   /**
    * Tells whether the tag name is listed in possibleStartTags.  The names
    * are compared by content (equals); the former reference comparison
    * failed for equal strings that were distinct objects.
    * @param name the tag name to look up
    * @return true if name equals one of the possible start tags
    */
   private boolean isPossibleStartTag ( String name ) {
      for ( int i = 0; i < possibleStartTags.length; i++ ) {
         if ( name.equals(possibleStartTags[i]) ) return true;
      }
      return false;
   }

   // handleEndTag
   /**
    * Handles an end tag.  Basically back out of properly nested tags.
    * When we've backed out all the way, then we can consider tags that
    * start new authors or a Context Section.  The end tag is also passed
    * on to the ContextSection while notInTable is true.
    * @param name the name of the element or environment that is being ended.
    */
   protected void handleEndTag ( String name ) {
      if ( index >= 0 && name.equals( startAuthorTags.elementAt(index) ) ) {
         index--;
      }
      if ( notInTable ) cs.handleEndTag(name);
   }

   // handleText
   /**
    * Given a string of text, parse it as possibly being the start of the
    * document body, else -- if grabAuthor is true -- parse it as an author
    * name or author list.  Text is ignored while grabAuthor is false.
    * @param text       the text in char[] form
    * @param offset     where in text the string starts
    * @param length     how many characters the string has
    * @param textString the string of text, including newlines and whitespace
    * @return true if we should stay in the author section, else false if this
    * textstring appears to be part of the body of the document.
    */
   protected boolean handleText (
      char[] text, int offset, int length, String textString ) {
      if ( DEBUG ) System.err.println (
         ME+"handling text <"+textString+">.  start tags = " + startAuthorTags.toString() );

      if ( !grabAuthor ) return true;

      if ( isEndOfAuthorSection(textString) ) {
         if (DEBUG) System.err.println(ME+
               "grabAuthor, startAuthor set false, doContexts set true");
         grabAuthor = false;
         return false;
      }

      // addText returns false once it has stored at least one author
      grabAuthor = addText ( text, offset, length, textString );
      return true;
   }

   /**
    * isEndOfAuthorSection examines this hunk of text and returns true if
    * it contains "abstract", "introduction" or "contents" in any letter
    * case, i.e. if this could be the start of the body of the text.
    * @param textString the hunk of text as a String
    * @return true if this looks like the start of the text body
    */
   private static boolean isEndOfAuthorSection ( String textString ) {
      // Use a fixed locale: under e.g. a Turkish default locale, 'I' is
      // lower-cased to a dotless i and "INTRODUCTION" would never match.
      String lc = textString.toLowerCase(java.util.Locale.ENGLISH);
      return lc.indexOf("abstract") != -1
          || lc.indexOf("introduction") != -1
          || lc.indexOf("contents") != -1;
   }

   /**
    * addText is given a hunk of text from the author section of a
    * paper and, if it looks like names, parses it into one or more
    * authors.
    *
    * The text is rejected if it contains anything other than letters,
    * whitespace, '-', '\'', '.' or ',' (we have probably run into an
    * institution or email address), if it is only whitespace, if it is
    * shorter than two characters, or if it is just the word "by".
    * On rejection the result is true only while no author has been found
    * yet: we really do want the first line of text that is after the title.
    *
    * @param text       hunk of text in char[] form
    * @param offset     where in the text the author string starts
    * @param length     how long the string is
    * @param textString hunk of text in String form
    * @return true if the parser should keep looking for an author
    * in the current section of text
    */
   private boolean addText (
      char[] text, int offset, int length, String textString ) {

      int whitespace = 0;
      int end = offset + length;
      for ( int i = offset; i < end; i++ ) {
         char c = text[i];
         if ( Character.isWhitespace(c) ) {
            whitespace++;
         } else if ( !Character.isLetter(c)
                     && c != '-' && c != '\'' && c != '.' && c != ',' ) {
            return v.size() == 0;    // not a name; keep looking or not
         }
      }

      // If it is ONLY whitespace or too short, reject it
      // but keep looking if we haven't gotten an author yet...
      if ( whitespace == length || length < 2 ) return v.size() == 0;

      // "by", "By" or "BY" on a line by itself is not an author
      if ( textString.trim().equalsIgnoreCase("by") ) return v.size() == 0;

      // Otherwise, handle the author and stop grabbing
      handleAuthor ( textString );
      return false;

   } // addText

   /**
    * getAuthors() returns an Author[] array out of the authors
    * seen by this AuthorSection object so far.
    * @return Author[] array, or null if no authors were found.
    */
   public Author[] getAuthors() {
      Author[] authors = null;
      if ( v != null ) {
         if (DEBUG)
            System.err.println(ME+"has found " + v.size() + " authors");
         if ( v.size() > 0 )
            authors = (Author[])v.toArray ( new Author[v.size()] );
         if (DEBUG)
            System.err.println(ME+"has constructed the Author array");
      }
      return authors;
   } // getAuthors

   // =========  PRIVATE ROUTINES ================================

   // textString contains one or more authors because it was found
   // after the title and before the body of the document.  It was
   // found by the parser's coming across a new paragraph or centered
   // text while looking for authors.

   // Possible syntaxes:
   // <p>author name<br>institution<br>other stuff</p>
   // <p>author name, author name, ... and author name <br>
   //         institution<br>other stuff</p>
   // <p>author name, author name, ... <br>
   // <center>author name<br>institution<br>other stuff</center>
   // <center>author (role)<br>institution<br>other stuff</center>
   // <a href...>author name</a>

   // Reject:
   // "by" on a line by itself

   /**
    * Splits textString into author names and stores each of them.
    * A string with neither "," nor " and " is taken as one name, after
    * cutting off anything from the first "(" on (a trailing "(role)").
    * Otherwise the part before the first " and " is split at commas, and
    * the text following " and " is stored as the last author.
    * @param textString the text holding one or more author names
    */
   private void handleAuthor ( String textString ) {
      if ( DEBUG )
         System.err.println(ME+"in handleAuthor, with string = ->"
            + textString + "<-" );

      int andPos = textString.indexOf(" and ");
      int commaPos = textString.indexOf(",");

      if ( commaPos == -1 && andPos == -1 ) {
         // assume that the entire string is one name unless it has
         // a trailing "(role)" on it.
         int paren = textString.indexOf("(");
         if ( paren != -1 ) textString = textString.substring(0, paren);
         Author a = new Author ( textString );
         v.addElement ( a );
         AuthorDatabase.stashAuthor ( a );
         if ( DEBUG )
            System.err.println(ME+"built new author " + textString );
         return;
      }

      // The comma-separated part of the list ends at " and ", if present.
      int listEnd = ( andPos == -1 ) ? textString.length() : andPos;
      int offset = 0;

      // Only split at commas located before " and ".  Splitting at a later
      // comma would store "X and Y" as a single author.
      while ( commaPos != -1 && commaPos < listEnd ) {
         putAuthor ( textString.substring(offset, commaPos) );
         offset = commaPos + 1;
         commaPos = textString.indexOf(",", offset);
      }

      // no " and ": whatever follows the last comma is the last author
      if ( andPos == -1 ) {
         putAuthor ( textString.substring(offset) );
         return;
      }

      // Handle the case when there is no comma before the " and ":
      // is there more than just whitespace between offset and " and "?
      // The skip loop must not consume the first non-blank character,
      // or a one-character name would be lost.
      while ( offset < listEnd
              && Character.isWhitespace(textString.charAt(offset)) ) {
         offset++;
      }
      if ( offset < listEnd )
         putAuthor ( textString.substring(offset, listEnd) );

      // 5 == " and ".length(): the rest of the string is the last author
      putAuthor ( textString.substring(andPos + 5) );
   } // handleAuthor

   /**
    * putAuthor builds an Author from the given name, appends it to the
    * list of authors and passes it to AuthorDatabase.stashAuthor.  Names
    * that are empty or only whitespace (e.g. after a trailing comma) are
    * skipped.
    * @param authorName the text of one author name
    */
   private void putAuthor ( String authorName ) {
      if ( authorName.trim().length() == 0 ) return;
      Author a = new Author ( authorName );
      v.addElement ( a );
      AuthorDatabase.stashAuthor ( a );
      if ( DEBUG )
         System.err.println(ME+"built new author " + a.toString() );
   }

   /**
    * gotAuthors is a convenience routine.
    * @return true if at least one author has been located
    */
   protected boolean gotAuthors () { return v.size() > 0; }

} // ends AuthorSection


