// bergmark - April 2000 - Reference Linking Project

package Linkable.Utility;

// Author - a convenience class for manipulating and matching
// names of authors

// Updates:
// 2000-10-18:  Add two remaining accessor functions
// 2000-10-20:  Fixed bug whereby equals() would return true if
//              the author names were not equal (sigh)
// 2000-11-27:  Make the "equals" routine more complete; add reconstruct
//              routine.  Finish reconstruct, 11-30.
// 2000-12-07:  Fixed merge so that it no longer blindly gets tokens
//              without first checking that some are available.
// 2000-12-08:  Made more logic repairs to the merge routine.

import java.util.Vector;
import java.util.StringTokenizer;
import java.io.Serializable;

import org.w3c.dom.*;

/**
 * The Author class is a utility for parsing author names into an Author
 * structure, for returning parts of names, printing names, and testing
 * author names for equality.
 *
 * A name string is split into the fields "first von last jr" (plus an
 * optional "et al." marker) by a small character-driven finite state
 * machine (see the private trans routine).  Every field except the last
 * non-empty one is terminated by a single blank so that the fields can
 * simply be concatenated to print the name.
 *
 * NOTE: equals(Author) is an overload, not an override of
 * Object.equals(Object), and it may modify the first names of both
 * authors.  Collections therefore still use identity equality.
 */
public class Author implements Serializable {

   // Name fields are terminated with a blank, except for the last one
   // Name fields are:  first von last jr
   private Vector names=new Vector(); // Strings: ways of spelling a name
   private String first = "";         // canonical first and middle names
   private String last = "";          // canonical last name
   private String von = "";           // things like "van der" but not "et al"
   private String jr="";              // Things like ", Sr."
   private String et="";              // et al.
   private String institution="";     // Author's institution

   private static final boolean DEBUG = false;
   private static final String ME = "Author: ";

   /**
    * Constructor.  Parses the given name into its fields.
    * Note that the constructor just creates an Author object; it does
    * NOT add the newly constructed object to the Author database.
    * This is so that clients of this class can construct an Author and
    * then see if there is already a similar one in the database.
    *
    * @param authorName the name as it was written; null is treated as ""
    */
   public Author ( String authorName ) {

      if (DEBUG) System.err.println(ME+" in Constructor");
      // A missing name is parsed like an empty one instead of throwing.
      String text = ( authorName == null ) ? "" : authorName;
      names.addElement ( text );

      // parse the string into its fields and populate Author object.
      // Tab and CR are delimiters too: a token such as "\tJohn" would
      // otherwise start with whitespace and be skipped entirely by trans.
      StringTokenizer st = new StringTokenizer(text, "., \t\r\n", true);
      int nCommas = countCommas (text);
      char state = 'a';   // we are going to run a finite state machine
      while (st.hasMoreTokens()) {
         state = trans ( nCommas, state, st.nextToken() );
      }

      // special case (kludge): a lone name is a last name
      if ( ! first.equals("") && last.equals("" ) ) {
         last = first; first = "";
      }
      if ( ! first.endsWith(" ") ) first = first + " ";
      if ( DEBUG ) System.err.println("End of tokens");
      if ( DEBUG ) {
         System.err.println(ME+"first name = " + first);
         System.err.println(ME+"last name = " + last);
         System.err.println(ME+"von = " + von);
         System.err.println(ME+"jr = " + jr );
         System.err.println(ME+"et = " + et);
      }
      if (DEBUG) System.err.println(ME+" leaving constructor");
   }

   // Question: should we also keep a list of title keys like ResearchIndex
   // does?  I suspect so, since it could serve to convince us that this is
   // indeed the same author.  E.g. hash key could be last name + first
   // 20 letters in significant title words.  Or store title separately.

   // Using MySQL to support external database: for a driver, get mm
   // from http://www.mysql.com/Contrib (it is under the GPL license)
   // Current version is 1.2

   // set functions for everything, e.g. add new name
   public void setFirstName (String text) { first = text; }
   // Warning: if application maintains a database of authors keyed
   // on the last name, the application had better make sure to update
   // the author database if the key has changed.
   public void setVon (String text) { von = text; }
   public void setLastName (String text) { last = text; }
   public void setJr (String text) { jr = text; }

   // get functions for everything.
   /** @return the last name followed by the jr suffix (last + jr) */
   public String getLastName () { return last+jr; }
   /** @return the first and middle names */
   public String getFirstName () { return first; }
   /** @return the von part, e.g. "van der " */
   public String getVon() { return von; }
   /** @return the jr suffix, e.g. "Jr." */
   public String getJr() { return jr; }

   /**
    * equals - determines whether two authors are probably the same author.
    * Last name, von part and jr suffix must match (separator blanks are
    * ignored); the first names must either match, or one of them must be
    * blank, or they must be mergeable (initials against full names).
    *
    * NOTE: side effect on authors first names - the parameter might get
    * its first name filled out by contents of it in this object instance
    * and/or the first name of this instance might get filled in.
    *
    * @param a the Author to be compared with this one; null is never equal
    * @return true if these are (probably) the same author
    */
   public boolean equals ( Author a ) {
      if ( a == null ) return false;
      if ( a == this ) return true;
      // Compare the bare last name.  getLastName() returns last+jr, which
      // made every author carrying a jr suffix unequal even to itself.
      if ( ! last.trim().equals ( a.last.trim() ) ) return false;
      if ( ! von.trim().equals ( a.getVon().trim() ) ) return false;
      if ( ! jr.trim().equals ( a.getJr().trim() ) ) return false;
      String otherFirstName = a.getFirstName();
      if ( first.equals ( otherFirstName ) ) return true;
      // The constructor pads an absent first name to " ", so test for a
      // blank value rather than for identity with the "" literal.
      if ( otherFirstName.trim().length() == 0 ) {
         a.setFirstName(first);
         return true;
      }
      if ( first.trim().length() == 0 ) {
         setFirstName(otherFirstName);
         return true;
      }
      return merge ( a, otherFirstName );
   }

   // add another spelling of the author name to this object
   public void addSpelling ( String text ) {
      names.addElement ( text );
   }

   /**
    * Given a string that "might" contain peoples names, return Author[].
    * Not implemented yet (TBD).
    *
    * @param text the string to scan for names
    * @return currently always null
    */
   public static Author[] decode ( String text ) {
      // Diagnostic is guarded by DEBUG like every other one in this class.
      if ( DEBUG ) System.err.println(ME+"in decode with string <" + text + ">");
      // TBD
      return null;
   }

   // PRIVATE FUNCTIONS

   // Finite State Machine for parsing Author Names.  Many thanks to
   // Gregory Ward for his writeup of Text::BibTex::Name in CPAN.
   // Number of commas:
   // 0 Name is first+ von* last [jr]  or  last first depending on length
   // 1 Name is von* last [jr], first+
   // 2 Name is von last, jr, first+
   // >2 Name is name, institution.  Parse the name as if only the first
   // two commas were present.  The rest is the institution
   // special words: "St.", "Jr", "Sr.", "et al." etc.
   // Single letters or single letters followed by a . are initials
   // and belong in the first name.
   //
   // Parameters: nCommas - number of commas in the whole name string
   //             state   - current state of the machine
   //             s       - the next token: a word, or a single delimiter
   // Returns the next state.  Whitespace tokens never change the state.
   private char trans ( int nCommas, char state, String s ) {
      if ( DEBUG ) System.err.println("Author:trans - "+" " + state + " " + s);
      char c = s.charAt(0);            // s is a complete token
      if ( Character.isWhitespace(c) ) return state;    // skip over blanks
      switch ( state ) {
         case 'a':  // initial state
            if ( Character.isLowerCase ( c ) ) { von = s; return 'm'; }
            if ( c == '.' || c == ',' ) return state;
            if ( nCommas == 0 ) { first = s; return 'b'; }
            if ( s.length() > 1 ) { last = s; return 'c'; }
            first = s;
            return 'b';
         case 'b': // We have first name, looking for second
            if ( c == '.' ) { first = first + "."; return 'h'; }
            if ( Character.isLowerCase(c) ) {
               first = first + " "; von = s; return 'j';
            }
            // have a last name if > 1 character
            if ( s.length() > 1 ) {
               first = first + " "; last = s; return 'i';
            }
            // could have been [last, first-initial.]
            if ( nCommas == 0 ) { first = first + " " + s; return 'b'; }
            last = first + " "; first = s;
            return 'e';
         case 'c': //  Have [Last], waiting for a comma
            if ( c == ',' ) return 'd';
            // Add the period, von, or Name.  Last name could be [Name Jr],
            // or [Name von] as in Brie von Beck or [Name Jr]
            last = last + " " + s;
            return state;
         case 'd': // Have [Last,] scanning first or jr
            if ( c == '.' || c == ',' ) return state;
            if ( nCommas == 1 ) { first = s; return 'e'; }
            if ( s.equals("et") ) { et = "et al."; return 'q'; }
            jr = s;
            return 'r';
         case 'e': // No more commas are expected.
                   // Have [last, first]. Scan for end of first
            if ( c == '.' ) { first = first + "."; return 'f'; }
            if ( c == ',' ) return 'q'; // must be institution
            if ( s.equals("et") ) { et = "et al."; return 'q'; } // et al. ends name
            first = first + " " + s;
            return 'e';
         case 'f': // ncommas = 1.  Have [Last, I.]
            if ( c == ',' ) return 'q';
            if ( c == '.' ) return state;
            first = first + " " + s;
            return 'e';
         case 'g': // Have name ','.  Scanning institution - not yet used
            if ( institution.equals("") ) institution = s;
            else if ( c == '.' || c == ',' ) institution = institution + c;
            else institution = institution + " " + s;
            return state;
         case 'h': // no commas, have [I.]
            if ( Character.isLowerCase ( c ) ) {
               first = first + " ";
               von = s;
               return 'j';
            }
            if ( c == '.' ) return state; // ignore erroneous .
            first = first + " "; last = s;
            return 'i';
         case 'i': // 0 or more commas, have [First Last]
            if ( c == '.' ) {
               if ( last.equals("St") ) { last = last + "."; return 'n'; }
               // The word taken for a last name was an abbreviation of a
               // given name.  No blank is appended here: state 'h' (or the
               // constructor) adds the separator, so adding one here as
               // well produced a double blank.
               first = first + last + "."; last = "";
               return 'h';
            }
            if ( c == ',' ) return state;
            if ( isJr ( s ) ) { last = last + " "; jr = s; return 'l'; }
            if ( s.equals("et") ) { et = "et al."; last += " "; return 'q'; }
            if ( Character.isLowerCase ( c ) ) {
               first = first + last + " ";
               last = "";
               von = s;
               return 'j';
            }
            newLastName ( s );
            return state;
         case 'j': // no commas, have [First von]
            if ( Character.isLowerCase ( c ) ) {
               von = von + " " + s; return state;
            }
            if ( c == '.' ) { von = von + "."; return state; }
            von = von + " "; last = s;
            return 'o';
         case 'l': // no commas, have = [I. Last jr]
            if ( c == '.' ) jr = jr + '.';
            return 'q';  // done with name.  Other things preclude '.'
         case 'm': // name started with a von particle, have [von*]
            if ( c == '.' || c == ',' ) return state;  // stray punctuation
            if ( Character.isLowerCase(c) ) {
               // further particles ("van der") belong to von as well
               von = von + " " + s; return state;
            }
            // First capitalised word is the last name.  Terminate von with
            // its blank (as state 'j' does) rather than prefixing last with
            // one.  With commas present, continue in 'c' so that the first
            // name after the comma is still picked up.
            von = von + " "; last = s;
            return ( nCommas == 0 ) ? 'o' : 'c';
         case 'n': // no commas, have name = [First St.]
            if ( c == '.' || Character.isLowerCase(c) || c == ',' ) return state;
            last = last + " " + s;
            return 'o';
         case 'o': // no commas, [I. von Last] | [First St. Last]
            if ( c == '.' ) {
               if ( last.equals("St") ) { last = last + "."; return 'n'; }
               return 'q';  // accept no more tokens. Not last initial.
            }
            if ( Character.isLowerCase(c) ) return 'q';
            if ( isJr ( s ) ) { last = last + " "; jr = s; return 'p'; }
            // Anything else ends the name.  (This used to fall through into
            // case 'p', which returns 'q' for every token other than '.'.)
            return 'q';
         case 'p': // no commas, [First von Last Jr]
            if ( c == '.' ) { jr = jr + "."; return 'q'; }
            return 'q';  // accept no more tokens for all else?
         case 'q': // no commas, name is complete, toss tokens away
            // Question: how about appending them to institution instead?
            return state;
         case 'r': // 1 or more commas. Scanned "Name, Jr".  Have [Name Jr]
            last = last + " ";
            if ( c == '.' ) { jr = jr + "."; return 's'; }
            if ( c == ',' ) { jr = jr + "."; return 't'; }
            first = s;
            return 'e';
         case 's': // 1 or more commas, Have [Name Jr.]
            if ( c == '.' ) return state;  // ignore duplicate .
            if ( c == ',' ) return 't';
            // Continue scanning the first name in 'e', as 'r' and 't' do.
            // This used to return the undefined state 'u', which made the
            // machine ignore the rest of the name.
            first = s;
            return 'e';
         case 't': // 2 or more commas. Scanned "Name, Jr," Have [Name Jr.]
            if ( c == '.' ) return state; // ignore bad .
            if ( c == ',' ) return 'q'; // accept no more tokens
            first = s;
            return 'e';
         default:
            if ( DEBUG ) System.err.println ("Author:trans - I am stumped");
      }
      return state;
   }

   // scan for jr: true if s is one of II, III, Jr, Sr (without period)
   private boolean isJr ( String s ) {
      return s.equals("II") || s.equals("III")
          || s.equals("Jr") || s.equals("Sr");
   }

   // Current last name is really part of the first and middle names;
   // String s is new last name.  On return, first ends with a single
   // blank, which is what the callers in state 'i' rely on when they
   // append further words to it.
   private void newLastName ( String s ) {
      if ( first.length() == 0 ) {       // never had a middle name
         first = last + " ";
      }
      else if ( first.endsWith(" ") ) {  // separator is already there
         first = first + last + " ";
      }
      else {                             // append current last to middle
         first = first + " " + last + " ";
      }
      last = s;             // and reset last to current string
   }

   // Count the commas in s, including one at position 0.
   private int countCommas ( String s ) {
      int result = 0;
      for ( int pos = s.indexOf(','); pos != -1; pos = s.indexOf(',', pos+1) ) {
         result++;
      }
      if ( DEBUG ) System.err.println("Author:countCommas returning " + result);
      return result;
   }

   // String representation for this author's name: the concatenated
   // name fields followed by a newline
   public String toString () {
      return first + von + last + jr + et + "\n";
   }

   /**
    * Return the dc:Author XML for this author.
    * The characters &amp;, &lt; and &gt; in the name are escaped so that
    * the element is well-formed; this assumes the name fields hold plain,
    * not already escaped, text.
    *
    * @param pad currently ignored
    * @return a dc:creator element followed by a newline
    */
   public String toXML ( String pad ) {
      return "<dc:creator>"+escapeXml(first+von+last+jr+et)+"</dc:creator>\n";
   }

   // Replace the characters that are markup in XML element content.
   private static String escapeXml ( String text ) {
      StringBuffer out = new StringBuffer ( text.length() + 16 );
      for ( int n = 0; n < text.length(); n++ ) {
         char c = text.charAt(n);
         switch ( c ) {
            case '&': out.append("&amp;"); break;
            case '<': out.append("&lt;"); break;
            case '>': out.append("&gt;"); break;
            default:  out.append(c);
         }
      }
      return out.toString();
   }

   // merge - if this Author instance can be merged with the specified
   // author (by expanding the first name) do it and return true.
   // The two first names are compared word by word: equal words match,
   // and a one-letter initial matches a word starting with that letter,
   // in which case the initial (and its period) is replaced by the word.
   // Words that only the longer name has are appended to the shorter one.
   // Any other difference means "not the same" - false is returned and
   // neither author is changed.
   // NOTE: neither this first name nor the given "aFirstName" is blank
   private boolean merge ( Author a, String aFirstName ) {
      if ( DEBUG ) System.err.println (ME
      +" in merge with first name ->" + first + "<- vs. input name ->"
      + aFirstName + "<-");
      String[] mine = splitFirstName ( first );
      String[] theirs = splitFirstName ( aFirstName );
      int shared = Math.min ( mine.length, theirs.length );
      for ( int n = 0; n < shared; n++ ) {
         String iword = stripPeriod ( mine[n] );
         String jword = stripPeriod ( theirs[n] );
         if ( iword.equals(jword) ) continue;   // same word, keep both
         if ( jword.length() == 1 && iword.startsWith ( jword ) ) {
            theirs[n] = iword;                  // expand their initial
         } else if ( iword.length() == 1 && jword.startsWith ( iword ) ) {
            mine[n] = jword;                    // expand our initial
         } else return false; // we have two words and they are unequal
      }
      setFirstName ( joinFirstName ( mine, theirs ) );
      a.setFirstName ( joinFirstName ( theirs, mine ) );
      if ( DEBUG ) System.err.println
         (ME+"result of merge: first->"+first+", aFirstName->"
         +a.getFirstName()+"<-");
      return true;
   }

   // Split a first-name field into its words.  Blanks are dropped; a
   // period is kept attached to the word it follows ("Q."), duplicate
   // periods and periods that follow no word are ignored.
   private static String[] splitFirstName ( String name ) {
      Vector parts = new Vector();
      StringTokenizer st = new StringTokenizer ( name, ". \t\r\n", true );
      while ( st.hasMoreTokens() ) {
         String token = st.nextToken();
         char c = token.charAt(0);
         if ( Character.isWhitespace ( c ) ) continue;
         if ( c == '.' ) {
            int lastIndex = parts.size() - 1;
            if ( lastIndex >= 0 ) {
               String word = (String) parts.elementAt ( lastIndex );
               if ( ! word.endsWith(".") ) {
                  parts.setElementAt ( word + ".", lastIndex );
               }
            }
            continue;
         }
         parts.addElement ( token );
      }
      String[] result = new String[parts.size()];
      parts.copyInto ( result );
      return result;
   }

   // The word without its trailing period, if it has one.
   private static String stripPeriod ( String word ) {
      if ( word.endsWith(".") ) return word.substring ( 0, word.length() - 1 );
      return word;
   }

   // Rebuild a first-name field: all words of "own", then the words that
   // only "other" has, each one terminated with a blank.
   private static String joinFirstName ( String[] own, String[] other ) {
      StringBuffer joined = new StringBuffer();
      for ( int n = 0; n < own.length; n++ ) {
         joined.append ( own[n] ).append ( ' ' );
      }
      for ( int n = own.length; n < other.length; n++ ) {
         joined.append ( other[n] ).append ( ' ' );
      }
      return joined.toString();
   }

   // reconstruct -
   /** rebuilds the Author from a node in a DOM tree
     * @param n the top node in the subtree containing the author
     * @return an Author object: the one fetched from the author database
     * if there is one, otherwise the newly constructed one
     * Note: a possible side-effect is that the author will be
     * added to the author database, or that the data currently
     * in the database will be updated (this is a side-effect of
     * the "stashAuthor" routine).
     * Note: authors without a last name will not go into the database
     * but a temporary Author object is returned to the caller anyway.
     * Note: this is public because Creation might call it
     */

   public static Author reconstruct ( Node n ) {
      String aname = MetaData.getValue( n);
      Author a = new Author ( aname );
      AuthorDatabase.stashAuthor ( a );
      // may or may not be same handle
      Author b = AuthorDatabase.fetchSpecificAuthor(a);
      if ( b!=null ) return b; else return a;
   }
}
