// bergmark - June 2000 - Reference Linking Project

// Decite -- turns a reference string into a Creation
// The reference tag has been removed from the string.
// The first bit should be the first author.

// Modifications:
// 2000-08-02: if deciter comes back with an Author whose last name is empty,
//             don't add it to the Author database
// 2000-10-12: DeEntify the XML file returned by deciter, so that it will
//             DOM-parse.  Change parameter to getDOM to be a String.
// 2000-10-18: Set FirstNameFirstHint to true (fixed 10/27)
//             Do not accept "and others" as an author
// 2000-11-21: Adding the XML decl to the string passed to MetaData.getDOM
//             is now the responsibility of the caller

package Linkable.Analysis;

import Linkable.API.Creation;
import Linkable.Utility.MetaData;           // for getDoc, getValue
import Linkable.Utility.CreationDatabase;   // for fetchCretion
import Linkable.Utility.AuthorDatabase;     // for stashAuthor, fetchAuthor
import Linkable.Utility.Author;             // for Author objects

// following needs xml-tr2/xml.jar in the $CLASSPATH
import org.w3c.dom.*;               // for Document, etc.

import java.io.*;                   // for PrintWriter

import uk.ac.soton.harvester.Deciter;
import uk.ac.soton.harvester.EntityDecode;

public class Decite {

   private static final String ME = "Decite: ";
   private static final boolean DEBUG = false;

   // Options handed to the Deciter constructor below:
   // "-v" turns debugging on, produces a file "deciter.err"
   // "-H" turns on FirstNameFirstHint
   private static String[] opts = {"-v","-H"};
   private static Deciter myDeciter = new Deciter ( null, opts );
   private static EntityDecode ed = new EntityDecode();

   // parse - turn a reference string into a Creation.
   //
   // If all goes well, DLS will parse the reference string, and here
   // we will come up with a Creation that corresponds to the reference
   // string.  If "null" is returned, that means that the parse failed
   // (getDocument gave back no Document).
   //
   // Steps:
   //   1. synthesize a key (URN) from the author, year and title values
   //   2. look the key up in the Creation database; failing that, try to
   //      merge with an entry that differs only by a "*" component
   //   3. failing that, start a new Creation
   //   4. copy date, title, displayID and authors from the reference
   //      into the Creation and stash it
   public static Creation parse ( String inReference ) {

      // Parse the reference string into a DOM tree
      Document doc = getDocument ( inReference );
      if ( doc == null ) {
         if (DEBUG) {
            System.err.println(ME+"could not parse the output from DLS");
         }
         return null;
      }

      // Using (known) elements from the tree construct a key for the
      // Creation database
      String yearValue = MetaData.getValue(doc, "year");
      String titleValue = MetaData.getValue ( doc, "title" );
      String author = MetaData.getValue(doc, "author");

      String key = Creation.synthesizeURN(author, lastFour(yearValue), titleValue );
      key = key.trim();
      if (DEBUG) {
         System.err.println(ME+"hashed DOM provides key ->" + key + "<-");
      }

      // Look up this work in the Creation database
      Creation c = CreationDatabase.fetchCreation ( key );
      boolean newCreation = false;

      if ( c != null ) {
         if (DEBUG) {
            System.err.println(ME
               +"this work is already in the Creation database");
         }
      } else if ( key.indexOf('*') != -1 ) {
         // The key has a "*" component: ask the database whether
         // an existing URN can be merged with it.  When there is no
         // author value the author part is "*", so the prefix length is 0.
         int prefix = ( author != null ) ? Math.min(10, author.length()) : 0;
         String mergableCreation = CreationDatabase.mergeCreation(key, prefix);
         if ( mergableCreation != null ) {
            c = CreationDatabase.fetchCreation(mergableCreation);
         }
         if ( c != null ) {
            // merge the two urns: the stored entry wins for key, date and title
            key = mergableCreation;
            yearValue = c.getDate();
            titleValue = c.getTitle();
         }
         // else: fall through with the original key and start a new Creation
      } else {
         // There is no "*" in the key.  See whether the database holds the
         // same work with one component still unknown ("*").
         // NOTE(review): the offsets below assume the key is laid out as
         // <up to 10 author chars><4 year chars><title> -- confirm against
         // Creation.synthesizeURN.  The length guards only prevent
         // substring from throwing on a shorter key.
         int authorLength = ( author != null ) ? Math.min(10, author.length()) : 0;
         int yearEnd = authorLength + 4;
         boolean hasAuthorPart = authorLength <= key.length();
         boolean hasYearPart = yearEnd <= key.length();

         if ( hasAuthorPart
              && (c = CreationDatabase.fetchCreation(
                     "*" + key.substring(authorLength))) != null ) {
            // stored entry lacks the authors
            c.setAuthors(getAuthorList(doc.getElementsByTagName("author")));
            adoptKey ( c, key );
         }
         else if ( hasYearPart
              && (c = CreationDatabase.fetchCreation(
                     key.substring(0, authorLength) + "*"
                     + key.substring(yearEnd))) != null ) {
            // stored entry lacks the date
            c.setDate(yearValue);
            adoptKey ( c, key );
         }
         else if ( hasYearPart
              && (c = CreationDatabase.fetchCreation(
                     key.substring(0, yearEnd) + "*")) != null ) {
            // stored entry lacks the title
            c.setTitle(titleValue);
            adoptKey ( c, key );
         }
      }

      if ( c == null ) {
         if (DEBUG) {
            System.err.println(ME+" adding a new Creation to the database");
         }
         c = new Creation ( );
         newCreation = true;
      }

      // Using information from the reference's Document, flesh out the
      // information in this creation.  Also create new authors where
      // necessary.
      c.setDate(yearValue);
      c.setTitle(titleValue);
      c.setDisplayID(MetaData.getValue(doc,"displayID"));

      // Construct an array of Authors and add that
      NodeList nl = doc.getElementsByTagName("author");
      if (DEBUG) {
         System.err.println(ME+"This creation has " + nl.getLength() + " authors");
      }
      c.setAuthors(getAuthorList(nl));  // getAuthorList updates author DB.

      // set the URN of a brand new Creation to be key
      if ( newCreation ) {
         c.setUrn ( key );
      }

      // store or overwrite this creation in the database
      CreationDatabase.stashCreation ( c );

      return c;
   }

   // lastFour - the last four characters of the year value ("just the
   // year").  A null value yields "", and a value of four characters or
   // fewer is returned unchanged instead of making substring throw.
   private static String lastFour ( String yearValue ) {
      if ( yearValue == null ) {
         return "";
      }
      int len = yearValue.length();
      return ( len <= 4 ) ? yearValue : yearValue.substring(len - 4);
   }

   // adoptKey - give an existing Creation the (more complete) key as its
   // URN and tell the Creation database about the change of URN.
   private static void adoptKey ( Creation c, String key ) {
      String oldUrn = c.getURN();
      c.setUrn(key);
      CreationDatabase.update(key, oldUrn);
   }

   // getDocument turns a reference string into a DOM object.
   // The reference is run through the Deciter, whose output is captured
   // in memory, de-entified, prefixed with an XML declaration and handed
   // to MetaData.getDOM.  Whatever getDOM returns is returned here.
   private static Document getDocument ( String inReference ) {
      // Historical note: calling
      //    myDeciter.setCitationOutput("uk.ac.soton.harvester.TXTOutput");
      // gets an instantiation error, although the class is found OK.
      if (DEBUG) {
         System.err.println(ME+" in parse, with reference string -->"
            + inReference + "<--");
      }

      // Capture the XML output of the deciter in a byte array.
      // The two empty strings are for page number and word number.
      ByteArrayOutputStream baos = new ByteArrayOutputStream();
      PrintWriter pw = new PrintWriter(baos);
      myDeciter.dodecite ( inReference, "", "", pw);
      pw.flush();  // necessary to get output for this one reference
      if (DEBUG) {
         System.err.println("\n"+ME+"result:\n" + baos.toString());
      }

      // DOM parser spits up on (some?) entified character
      String sbaos = deEntify (baos);

      // parse the text into a DOM tree (a static utility routine);
      // adding the XML declaration is the caller's job, i.e. ours
      return MetaData.getDOM ("<?xml version=\"1.0\" ?>\n" + sbaos);
   }

   // getAuthorList - extract an Author[] array from a NodeList.
   // Returns null when the list is null or empty; otherwise one Author
   // per node, in list order.  Side effect: authors with a usable last
   // name are looked up in, and where necessary added to, the Author
   // database.
   private static Author[] getAuthorList ( NodeList nl) {
      if ( nl == null ) {
         if (DEBUG) {
            System.err.println(ME+"in getAuthorList, NodeList is null");
         }
         return null;
      }
      int k = nl.getLength();
      if (DEBUG) {
         System.err.println(ME+"in getAuthorList with a NodeList of " + k
            + " elements");
      }
      if ( k == 0 ) {
         return null;
      }

      Author[] result = new Author[k];

      // Using Southampton's Deciter, you get <author> nodes with
      // attributes "lastname" and "initials".  Often (as in the case of
      // institutions) last name comes back "" or is missing.  Don't add
      // these to the Author database.  Also there are cases where et al.
      // are still not handled correctly and the last name becomes "al".
      // Also, don't store "and others".

      for (int i=0; i<k; i++ ) {
         NamedNodeMap nnm = nl.item(i).getAttributes();
         String lastName = attributeValue ( nnm, "lastname" );
         String initials = attributeValue ( nnm, "initials" );

         result[i] = new Author(initials + " " + lastName);
         if (DEBUG) {
            System.err.println("("+i+") "+result[i].toString());
         }

         if ( isStorableLastName ( lastName ) ) {
            // Reuse the database's Author when it equals this one;
            // otherwise add this new author to the author database.
            Author v = AuthorDatabase.fetchSpecificAuthor ( result[i] );
            if ( v != null && result[i].equals ( v ) ) {
               result[i] = v;   // the fresh object goes to Garbage Collection
            } else {
               AuthorDatabase.stashAuthor ( result[i] );
            }
         }
      }

      return result;
   }

   // attributeValue - value of the named attribute, or "" when the map,
   // the attribute or its value is absent (NamedNodeMap.getNamedItem
   // returns null for a name that is not present).
   private static String attributeValue ( NamedNodeMap nnm, String name ) {
      if ( nnm == null ) {
         return "";
      }
      Node attr = nnm.getNamedItem(name);
      if ( attr == null || attr.getNodeValue() == null ) {
         return "";
      }
      return attr.getNodeValue();
   }

   // isStorableLastName - false for the last names that must not go into
   // the Author database: empty, "and", "others" and "al".
   private static boolean isStorableLastName ( String lastName ) {
      return !lastName.equals("")
         && !lastName.equals("and")
         && !lastName.equals("others")
         && !lastName.equals("al");
   }

   // deEntify - rewrite every named entity "&name;" in the deciter output
   // as the hexadecimal character reference "&#x...;" built from the code
   // that ed.lookup(name) supplies, so that the text will DOM-parse.
   //
   //  - Numeric references ("&#...;") are copied through untouched.
   //  - An "&" with no ";" anywhere after it is spurious and is left as is.
   //  - When a second "&" appears before the ";", the earlier one is
   //    treated as spurious text and the entity starts at the later one.
   //
   // NOTE(review): what ed.lookup returns for a name it does not know is
   // not visible here -- confirm that the result is still a valid code.
   // TBD: handle all of the special ones using Les' hashtables.
   private static
   String deEntify ( ByteArrayOutputStream baos ) {
      String work = baos.toString();
      StringBuffer out = new StringBuffer ( work.length() );

      int copied = 0;                  // work[0..copied) is already in out
      int amp = work.indexOf('&');
      while ( amp != -1 ) {
         int semi = work.indexOf(';', amp);
         if ( semi == -1 ) {
            break;   // no terminator left: nothing further can be an entity
         }
         // the entity starts at the last "&" in front of the terminator
         amp = work.lastIndexOf('&', semi);
         String name = work.substring(amp+1, semi);

         if ( !name.startsWith("#") ) {
            if (DEBUG) {
               System.err.println(ME+"deEntify found an entity: "
                  + work.substring(amp,semi+1));
            }
            int n = ed.lookup( name );
            out.append(work.substring(copied, amp));
            out.append("&#x").append(Integer.toString(n,16)).append(";");
            copied = semi + 1;
         }
         amp = work.indexOf('&', semi+1);
      }
      out.append(work.substring(copied));
      return out.toString();
   }

}
