// bergmark - june 2000 - Reference Linking Project

package Linkable.Utility;

// MetaData - A class with various static methods for analyzing and
// generating metadata files.  See also GenXML.java.

// Modifications: 
// 2000-07-28 Did away with MIMEfile
// 2000-08-01 Add "yyyy" to list of date formats (generated OAMS date is
//            then yyyy-01-01)
// 2000-08-03 MetaData object has nothing to do with Surrogates (and BibData),
//            only Creations.  setFile() was modified to reflect this
// 2000-10-04 On the other hand, MetaData objects are instantiated only by
//            BibData objects whose Local Metadata is in a mixture of
//            dublin core and home grown.  The static methods are used
//            by decite and the XML there is Les Carr's format.
// 2000-10-04 Changed from OAMS metadata to DC.  Handle three kinds of
//            <dc:identifier>'s: http:, doi:, and urn:
// 2000-10-12 Change input parameter to getDOM to be a String (which has
//            been de-entified by deciter)
// 2000-11-21 Add the getData routine to read in XML files
// 2000-11-30 Add a getValue() routine for Element nodes
// 2001-05-30 Add a wrapper around toOAMS that returns yyyy-mm-dd if
//            a string contains a parsable date
// 2001-06-21 Fix indexing bug in containsDate()

import Linkable.API.*;

import org.xml.sax.InputSource;     // needs xml-tr2/xml.jar in $CLASSPATH
import org.xml.sax.AttributeList;   // needs xml-tr2/xml.jar in $CLASSPATH
import javax.xml.parsers.*;         // needs jaxp1.0.1/jaxp.jar in $CLASSPATH
import org.w3c.dom.*;               // needs xml-tr2/xml.jar in $CLASSPATH

import uk.ac.soton.harvester.EntityDecode;

import java.io.*;
import java.util.Vector;
import java.util.StringTokenizer;
import java.util.Date;
import java.text.SimpleDateFormat;

public class MetaData {

   // Prefix prepended to the diagnostic lines this class writes to System.err.
   private static final String ME = "MetaData: ";
   // Diagnostic switch; takes the value of CONFIG.DEBUG when this class is initialized.
   private static boolean DEBUG = CONFIG.DEBUG;

   // Used by getValue(Node) to look up entity-reference nodes by name.
   private static EntityDecode ed = new EntityDecode();

   // Per-instance parsing state used by setFile() and the "cook" methods.
   // factory and domParser are set up in the constructor (domParser stays
   // null if no parser could be built); doc is assigned by setFile().
   private DocumentBuilderFactory factory;
   private DocumentBuilder domParser = null;
   private Document doc = null;

   // The Creation that the cook methods fill in from the parsed metadata.
   private Creation c = null;

   /**
    * Constructor - remembers the Creation to be filled in and tries to
    * obtain a DOM parser for later use by setFile().  If no parser can
    * be built, domParser is left null; the failure is reported only
    * when DEBUG is on.
    * @param _c the Creation that the cook methods will update
    */
   public MetaData(Creation _c) {
      this.c = _c;
      this.factory = DocumentBuilderFactory.newInstance();
      try {
         this.domParser = this.factory.newDocumentBuilder();
      } catch (Exception e) {
         if (DEBUG) {
            System.err.println(ME+"caught Exception " + e.toString()
               + " while trying to construct a MetaData object");
         }
      }
      if (DEBUG) {
         System.err.println(ME+" have constructed a MetaData object.");
      }
   }

   /**
    * setFile - gives this MetaData object a fragment of XML metadata to
    * parse.  The fragment is wrapped in a "work" root element, parsed
    * into a DOM tree, and then "cooked": the identifiers, title, authors
    * and date found in the tree are applied to Creation c, which is
    * finally passed to CreationDatabase.stashCreation().
    * Returns quietly, without cooking anything, when no parser is
    * available or the XML cannot be parsed.
    * @param mf the local metadata as XML text (a fragment, not a
    *           complete document)
    * Note: a side effect is that authors extracted from the metadata
    * are added to the Author database and to Creation c.
    * Note (from the original author): at present this procedure is
    * invoked only by BibData.
    */
   public void setFile ( String mf ) {
      if ( DEBUG )
         // string concatenation (rather than c.toString()) tolerates a null c
         System.err.println(ME+"in setFile, this is what is already "
            + "known about this item, as a work:\n" + c );

      if ( domParser == null ) return;  // no parser, return quietly

      // 2000-09-29: parser has been complaining about bad character
      // at the end of the file.  The bad character is x3c, or "<".
      // The problem was that mf is just a patch of XML and
      // not the whole thing.  So put mf into a complete XML file:

      mf = "<?xml version=\"1.0\"?>\n<work>\n"+mf+"\n</work>\n";

      // We do have an XML document and we do have a parser.

      try {     // parse the input into a document tree
         // Feed the parser characters, not bytes: going through
         // mf.getBytes() would encode with the platform default charset
         // while the parser decodes the byte stream by its own rules,
         // which can garble non-ASCII metadata.
         doc = domParser.parse ( new InputSource ( new StringReader ( mf ) ) );
      } catch ( Exception e ) {
         if (DEBUG)
            System.err.println(ME+"caught Exception " + e.toString()
               + " while trying to parse an XML file");
         return;   // (nothing was parsed; nothing to do)
      }
      if (DEBUG) {
         System.err.println(ME+"done parsing, start cooking ...");
         System.err.println(ME+"will cook this metafile:\n"+mf);
      }
      cookFullID();
      cookTitle();
      cookAuthors();
      cookDate();
      CreationDatabase.stashCreation ( c );
   }

   /**
    * getDOM - given the text of an XML metadata document, construct a
    * parse tree.
    * @param bytes the XML data to be parsed, as a String (despite the
    *              parameter name)
    * @return the Document representing the XML, or null if a parser
    *         could not be built or the text could not be parsed
    */
   public static Document getDOM ( String bytes ) {
      DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
      if (DEBUG) {
         System.err.println(ME+"in getDOM, reading item's metadata:");
         System.err.println(bytes);
      }

      DocumentBuilder builder;
      try {
         builder = builderFactory.newDocumentBuilder();
      } catch (Exception e) {
         System.err.println(ME+"While trying to construct a parser, "
            + "caught Exception " + e.toString() );
         return null;
      }

      try {
         Document tree = builder.parse ( new InputSource ( new StringReader ( bytes ) ) );
         if (DEBUG) {
            System.err.println(ME+"got a DOM parser");
         }
         return tree;
      } catch (Exception e) {
         System.err.println(ME+"caught Exception " + e.toString()
            + " while trying to parse an XML file");
         return null;
      }
   }

   // If the item's URL has not only a DOI but also a metafile, we
   // should be able to get a publication date from the metafile
   /**
    * getDate - given the location of an XML metadata file, return the
    * text of its first "date" element.
    * @param url the URL (system identifier) of the metadata file
    * @return the text of the first "date" element, or "" if the file
    *         cannot be parsed, has no "date" element, or that element
    *         does not start with text
    */
   public static String getDate ( String url ) {
      DocumentBuilderFactory f = DocumentBuilderFactory.newInstance();
      DocumentBuilder myParser;
      Document doc;
      if (DEBUG) System.err.println(ME+"in getDate, reading item's metadata");
      try { myParser = f.newDocumentBuilder(); }
      catch (Exception e){
         System.err.println(ME+"caught Exception " + e.toString()
         + " while trying to construct a parser");
         return "";
      }
      try {
         doc = myParser.parse ( new InputSource( url ) );
      } catch ( Exception e ) {
         System.err.println(ME+"caught Exception " + e.toString()
         + " while trying to parse an XML file");
         return "";
      }
      if (DEBUG) System.err.println(ME+"done parsing, doc = " + doc );

      NodeList nl = doc.getElementsByTagName ( "date" );
      // getElementsByTagName yields an empty list (not null) when nothing
      // matches, so the "no date" case must be detected by length.
      int count = ( nl == null ) ? 0 : nl.getLength();
      if (DEBUG) System.err.println(ME+"There are " + count +
         " nodes with the date tag");
      if ( count == 0 ) {
         System.err.println(ME+"found no tags exactly equal to date");
         return "";
      }

      // Use the first node tagged "date"; it may be empty (no children).
      Node firstChild = nl.item(0).getFirstChild();
      if ( firstChild != null && firstChild.getNodeType() == Node.TEXT_NODE ) {
         String value = firstChild.getNodeValue();
         if (DEBUG) System.err.println(ME+"returning <" + value +">" );
         return value;
      }
      System.err.println(ME+"found no text under this node");
      return "";
   }

   /**
    * getValue - given an Element node, return its textual value.
    * The value is the concatenation, in document order, of the node's
    * immediate text children and of the characters that the entity
    * decoder returns for its immediate entity-reference children.
    * Other kinds of children are ignored.
    * @param nameNode the Element whose value is wanted
    * @return the collected text, or "" if nameNode is null, is not an
    *         Element, or has no text
    */
    public static String getValue (Node nameNode) {

      if ( nameNode == null ) {
         System.err.println(ME+"getValue called with a null Node");
         return "";
      }

      if (DEBUG) System.err.println(ME+"in getValue of element named ->"
      +nameNode.getNodeName()+"<-");

      if ( nameNode.getNodeType() != Node.ELEMENT_NODE ) {
         System.err.println(ME+"in getVAlue with node whose type is "
         + nameNode.getNodeType() + " which is not Node.ELEMENT_NODE");
         return "";
      }

      NodeList nl = nameNode.getChildNodes();
      if ( nl == null ) return "";

      if ( DEBUG ) System.err.println(ME+nl.getLength()
         +" nodes in this element");

      // Accumulate in a buffer rather than re-allocating a String per child.
      StringBuffer result = new StringBuffer();
      for (int i=0; i<nl.getLength(); i++) {
         Node n = nl.item(i);
         switch (n.getNodeType()) {
            case Node.TEXT_NODE:
               result.append(n.getNodeValue());
               break;
            case Node.ENTITY_REFERENCE_NODE:
               // the entity's name is looked up and appended as one character
               result.append((char)ed.lookup(n.getNodeName()));
               break;
            default:
               // comments, nested elements, etc. contribute nothing
         }
      }

      if (DEBUG) System.err.println(ME+"returning ->"+result+"<-");
      return result.toString();
   }


   /**
    * getValue - given the name of a tag in a Document, return the first value.
    * For every tag except "author" the value is the text of the first
    * child of the first element with that name.  For "author" (the
    * Deciter special case) the value is that element's "lastname"
    * attribute.
    * @param doc the Document object representing an XML tree
    * @param name the String which specifies the tag name
    * @return the value described above, or "" if doc is null, no such
    *         tag exists, the tag has no leading text, or (for "author")
    *         the lastname attribute is absent
    */
    public static String getValue (Document doc, String name) {

      if (DEBUG) System.err.println(ME+"in getValue of tag named ->"+name+"<-");

      if ( doc == null ) {
         System.err.println(ME+"called with a null Document");
         return "";
      }

      NodeList nl = doc.getElementsByTagName ( name );
      // An unmatched name gives an empty list, so test the length as well.
      if ( nl == null || nl.getLength() == 0 ) {
         if (DEBUG) System.err.println(ME+"found no tags exactly equal to "+name);
         return "";
      }
      if (DEBUG) System.err.println(ME+"There are " + nl.getLength() +
         " nodes with the "+name+" tag");

      Node nameNode = nl.item(0);         // Get the first node tagged "name"

      // in the Deciter, the last name is actually the value of
      // an attribute of the node named "author"
      if ( "author".equals(name) ) {      // special case for Deciter
         NamedNodeMap nnm = nameNode.getAttributes();
         Node lastName = ( nnm == null ) ? null : nnm.getNamedItem("lastname");
         if ( lastName == null ) {
            // an <author> without the attribute used to cause a NullPointerException
            if (DEBUG) System.err.println(ME+"author node has no lastname attribute");
            return "";
         }
         if (DEBUG) System.err.println(ME+"value of the lastname attribute is "
             + lastName.getNodeValue());
         return lastName.getNodeValue();
      }

      // usual case
      Node firstChild = nameNode.getFirstChild();
      if ( firstChild == null ) {
         if (DEBUG) System.err.println(ME+"no children for node named " + name);
         return "";
      }
      if ( firstChild.getNodeType() == Node.TEXT_NODE ) {
         if (DEBUG) System.err.println(ME+"returning ->" +
            firstChild.getNodeValue() +"<-" );
         return firstChild.getNodeValue();
      }
      if (DEBUG) System.err.println(ME+"found no text under this node");
      return "";
    }
    
   /**
    * getValues - given the name of a tag in a Document, return the
    * leading text of every element with that name.
    * Each value has its newlines replaced by blanks and is trimmed.
    * Elements with no children, or whose first child is not text, are
    * skipped.
    * @param doc the Document object representing an XML tree
    * @param name the String which specifies the tag name
    * @return an array of the collected Strings, or null if doc is null
    *         or nothing was collected
    */

    public static String[] getValues (Document doc, String name) {

      if (DEBUG) {
         System.err.println(ME+"in getValues of tag named ->"+name+"<-");
      }

      if ( doc == null ) {
         System.err.println(ME+"getValues called with a null Document");
         return null;
      }

      NodeList tagged = doc.getElementsByTagName ( name );
      if ( tagged == null ) {
         return null;
      }
      if (DEBUG) {
         System.err.println(ME+"There are " + tagged.getLength() +
            " nodes with the "+name+" tag");
      }

      Vector texts = new Vector();
      int idx = 0;
      while ( idx < tagged.getLength() ) {
         Node child = tagged.item(idx).getFirstChild();
         idx++;
         if ( child == null ) {
            if (DEBUG) {
               System.err.println(ME+"no children for node named " + name);
            }
            continue;
         }
         if ( child.getNodeType() != Node.TEXT_NODE ) {
            if (DEBUG) {
               System.err.println(ME+"found no text under this node");
            }
            continue;
         }
         if (DEBUG) {
            System.err.println(ME+"returning ->" +
               child.getNodeValue() +"<-" );
         }
         texts.addElement ( child.getNodeValue().replace('\n',' ').trim() );
      }

      if ( texts.isEmpty() ) {
         if (DEBUG) {
            System.err.println(ME+"found no tags exactly equal to "+name);
         }
         return null;
      }

      if (DEBUG) {
         System.err.println(ME+"getValues returning "
            + "a string array of size " + texts.size() );
      }
      // toArray() without an argument returns Object[], which cannot be
      // cast to String[]; hand it a typed array instead.
      return (String[]) texts.toArray ( new String[texts.size()] );
    }

    /** getAttr -
      * Accesses a specific attribute of the first node with a given name.
      * @param doc  The Document containing the XML tree
      * @param name The name of the node of interest
      * @param attr The name of the attribute of interest
      * @return the attribute's value, or "" if doc is null, no node has
      *         that name, or the first such node lacks the attribute
      */
    public static String getAttr (Document doc, String name, String attr) {
       if (DEBUG) System.err.println(ME+"in getAttr for attribute named "
       + attr + " for node tag " + name);

       if ( doc == null ) {
          System.err.println(ME+"getAttr called with a null Document");
          return "";
       }

       NodeList nl = doc.getElementsByTagName( name );
       // No match is reported as an empty list, whose item(0) is null.
       if ( nl == null || nl.getLength() == 0 ) {
          if (DEBUG)
             System.err.println(ME+"found no tags exactly equal to "+name);
          return "";
       }
       if (DEBUG) System.err.println
          (ME+"found "+nl.getLength()+" tags, will use first");

       Node nameNode = nl.item(0);
       NamedNodeMap attrs = nameNode.getAttributes();
       if ( attrs == null ) return "";

       Node attrNode = attrs.getNamedItem( attr );
       if ( attrNode == null ) {
          // the tag exists but does not carry this attribute
          if (DEBUG) System.err.println
             (ME+"getAttr found no attribute named " + attr);
          return "";
       }
       if (DEBUG) System.err.println
          (ME+"getAttr returning ->" + attrNode.getNodeValue() + "<-");
       return attrNode.getNodeValue();
    }


    /**
     * dumpNodes - debugging aid that writes every element of the
     * Document to System.err, one numbered line per element.
     * @param doc the Document whose elements are to be listed
     */
    public static void dumpNodes ( Document doc ) {
        System.err.println("\n"+ME+"All nodes in this Document:\n");
        NodeList everything = doc.getElementsByTagName("*");
        int position = 0;
        while ( position < everything.getLength() ) {
            System.err.println(ME+"("+position+")  "+ everything.item(position));
            position++;
        }
    }

   /**
    * containsDate is a convenience routine to rewrite an input string,
    * which contains a date, into "yyyy-mm-dd" format.
    * The string is scanned left to right for a run of digits and the
    * following shapes are tried at each run, in this order:
    *   1. yyyy-mm-dd        (returned as is, if it is a real date)
    *   2. mm/dd/yy or mm/dd/yyyy
    *   3. a four-digit year starting with 19 or 2, optionally preceded
    *      by text that begins at the nearest upper-case letter (e.g.
    *      "June 2000"); if that text cannot be converted, the year
    *      alone is used and yields yyyy-01-01
    * @param inDate the string that potentially contains a date
    * @return the formatted date, as a string, or null if the string
    * is null or contains no date
    */
    public static String containsDate ( String inDate ) {
      if ( inDate == null ) return null;
      final int len = inDate.length();

      int i = 0;
      while ( i < len ) {
         // no (more) digits ==> no date
         while ( i < len && !Character.isDigit(inDate.charAt(i)) ) i++;
         if ( i == len ) return null;

         if (DEBUG) {
            System.err.println(ME+"Have a digit at postion " + i
               +": " + inDate.charAt(i));
         }

         // two digits in a row, with at least one character after them?
         if ( i >= len-2 || !Character.isDigit(inDate.charAt(i+1)) ) {
            i += 2; continue;      // resume at position i+2
         }
         // from here on position i+2 is known to exist

         // 1. yyyy-mm-dd starting at position i
         if ( i + 10 <= len ) {
            String temp = inDate.substring(i, i+10);
            if ( isStrictOamsDate(temp) ) return temp;
         }

         // 2. mm/dd/yy; every index used is bounds-checked first
         if ( i + 8 <= len && inDate.charAt(i+2) == '/' ) {
            int end = i + 8;
            // Take all four digits of an mm/dd/yyyy year.  Cutting it to
            // two digits ("12/25/19" out of "12/25/1999") would make the
            // parser pick the century itself and produce the wrong year.
            if ( i + 10 <= len && inDate.charAt(i+5) == '/'
               && Character.isDigit(inDate.charAt(i+6))
               && Character.isDigit(inDate.charAt(i+7))
               && Character.isDigit(inDate.charAt(i+8))
               && Character.isDigit(inDate.charAt(i+9)) ) {
               end = i + 10;
            }
            String temp = inDate.substring(i, end);
            String result = toOAMS ( temp );
            // toOAMS hands back its argument unchanged when it gives up
            if ( !result.equals(temp) ) return result;
         }

         // 3. four digits in a row?
         if ( i + 4 <= len && Character.isDigit(inDate.charAt(i+2))
            && Character.isDigit(inDate.charAt(i+3)) ) {
            char first = inDate.charAt(i);
            if ( (first == '1' && inDate.charAt(i+1) == '9') || first == '2' ) {
               // we have 19dd or 2ddd - look for preceding 'JFMASOND'
               int j = i-1;
               while ( j >= 0 && !Character.isUpperCase(inDate.charAt(j)) ) j--;
               if ( j >= 0 ) {
                  // e.g. "June 2000", or the "August 1999" of "July/August 1999"
                  String candidate = inDate.substring(j, i+4);
                  String converted = toOAMS ( candidate );
                  if ( !converted.equals(candidate) ) return converted;
               }
               // No usable month text.  Always returning here also
               // guarantees the scan cannot revisit the same position.
               return toOAMS ( inDate.substring(i, i+4) );
            }
            i += 5;   // resume after the 4 digits
         } else {
            i += 3;   // resume after the 2 digits
         }
      }

      return null;
    }

    // True when s is exactly dddd-dd-dd and names a real calendar date.
    private static boolean isStrictOamsDate ( String s ) {
      if ( s.length() != 10 ) return false;
      for ( int k = 0; k < 10; k++ ) {
         char ch = s.charAt(k);
         if ( k == 4 || k == 7 ) {
            if ( ch != '-' ) return false;
         } else if ( ch < '0' || ch > '9' ) {
            return false;
         }
      }
      SimpleDateFormat strict = new SimpleDateFormat("yyyy-MM-dd");
      strict.setLenient(false);      // reject month 13, day 45, ...
      try {
         strict.parse(s);
         return true;
      } catch ( java.text.ParseException notADate ) {
         return false;
      }
    }

   /**
    * toOAMS is a convenience routine to rewrite an input string, which
    * is a date, into OAMS format, which is yyyy-mm-dd.
    * A string that already parses as yyyy-MM-dd is handed back
    * untouched.  Otherwise the formats "MMM yyyy", "MM/dd/yy", "yyyy"
    * and "MMM/MMM yyyy" are tried in that order and the first that
    * parses is used.
    * @param inDate the unformatted date, as a string
    * @return the formatted date, as a string, or inDate itself when no
    * format could parse it
    */
   public static String toOAMS (String inDate) {
      final SimpleDateFormat target = new SimpleDateFormat("yyyy-MM-dd"); // oams

      // already in oams format?
      try {
         target.parse(inDate);
         return inDate;
      } catch (Exception notOams) {
         // not in oams format - try some other formats below
      }

      final String[] alternatives =
         {"MMM yyyy", "MM/dd/yy", "yyyy", "MMM/MMM yyyy"};

      int k = 0;
      while ( k < alternatives.length ) {
         SimpleDateFormat attempt = new SimpleDateFormat(alternatives[k]);
         try {
            Date parsed = attempt.parse(inDate);
            return target.format(parsed);
         } catch (Exception noMatch) {
            // this format does not fit; move on to the next one
         }
         k++;
      }

      if (DEBUG) {
         System.err.println(ME+"in toOAMS failed to convert inDate <"
            + inDate +">");
      }
      return inDate;
   } // toOAMS

   // getData -
   /**
     * Reads everything in from what is presumably an XML file and
     * returns it to the caller.  Every line read is followed by "\n"
     * in the result, whatever line terminator the file used.
     * The file is decoded with the platform default charset.
     * @param filename the absolute pathname of the file to be read in
     * @return The string of XML data from the file, or null if the
     *         file is not readable or reading it fails
     */
   public static String getData ( String filename ) {
      System.err.println(ME+"Reading byte data ( from " + filename + ")");
      File f = new File ( filename );
      BufferedReader fr = null;
      try {
         if ( !f.canRead() ) return null;
         fr = new BufferedReader(new InputStreamReader(
            new FileInputStream ( f ) ) );
         StringBuffer sb = new StringBuffer();
         String line;
         // readLine() returns null at end of file.  ready() only says
         // whether a read would block, so it is not an end-of-file test.
         while ( (line = fr.readLine()) != null ) {
            sb.append( line ).append( "\n" );
         }
         return sb.toString();
      } catch ( Exception e ) {
         System.err.println(ME+"Reading " + filename + " failed");
         return null;
      } finally {
         // release the file handle on the failure path as well
         if ( fr != null ) {
            try { fr.close(); }
            catch ( IOException ignored ) { /* nothing useful to do */ }
         }
      }
   }



   // ====================  Private Methods ==========================

   // Now that we have switched to Dublin Core, there could be quite a
   // few identifiers, including "doi:...", "urn:...", and "http:..."
   //
   // cookFullID - walks every <dc:identifier> of the parsed document and
   // copies "doi:" and "urn:" values (without the prefix) into
   // Creation c.  Identifiers with any other prefix are ignored, and
   // identifiers that do not start with text are skipped.
   private void cookFullID() {
      NodeList nl = doc.getElementsByTagName ( "dc:identifier" );
      if ( nl == null ) { System.err.println(ME
         + "Something wrong - NodeList nl is null!");
         return;
      }
      if ( c == null ) {
         System.err.println(ME+"Something wrong - no creation to be set");
         return;
      }
      int n = nl.getLength();
      if (DEBUG) System.err.println(ME+"looking at " + n +
          " <dc.identifer> nodes in all");
      for (int i=0; i<n; i++) {
         Node e = nl.item(i);
         if ( e == null ) {
            System.err.println(ME
               + "Something wrong - item " + i + " is null");
            continue;
         }
         // we've got a <dc:identifier>; go down to the text node
         Node t = e.getFirstChild();
         String raw = ( t == null ) ? null : t.getNodeValue();
         if ( raw == null ) {
            // empty element, or first child is not text: nothing to read
            if (DEBUG) System.err.println(ME+"skipping <dc:identifier> "
               + i + " which has no text");
            continue;
         }
         String value = raw.trim();
         if ( value.startsWith("doi:")) {
            if (DEBUG) System.err.println(ME+"fullID node? fullID is " + value);
            c.setDoi (value.substring(4));
         } else if ( value.startsWith("urn:")) {
            if (DEBUG) System.err.println(ME+"urn node? urn is " + value);
            c.setUrn (value.substring(4));
            if (DEBUG) System.err.println(ME+"has set creation's Urn to "
               + value.substring(4));
         }
      } // end for
   }

   // TBD: don't just output the title.  output Utils.PCDATA(title)

   // cookTitle - takes the text of the first <dc:title> of the parsed
   // document, trims it, and passes it to handleTitle().  Reports the
   // problem and does nothing else when there is no title text.
   private void cookTitle() {
      NodeList nl = doc.getElementsByTagName ( "dc:title" );
      // item(0) is null when the document has no <dc:title> at all
      Node titleElement = ( nl == null ) ? null : nl.item(0);
      Node t = ( titleElement == null ) ? null : titleElement.getFirstChild();
      String title = ( t == null ) ? null : t.getNodeValue();
      if ( title == null ) {
         System.err.println(ME+"Something wrong - found no title");
         return;      // nothing to hand to handleTitle
      }
      if (DEBUG) System.err.println(ME+"title node? name is " + t.getNodeName());
      handleTitle (title.trim());
   }

   // cookDate - takes the text of the first <dc:date> of the parsed
   // document, trims it, and stores it in Creation c.  Leaves c's date
   // alone when the document has no date text.
   private void cookDate() {
      NodeList nl = doc.getElementsByTagName ( "dc:date" );
      // item(0) is null when the document has no <dc:date> at all
      Node dateElement = ( nl == null ) ? null : nl.item(0);
      Node t = ( dateElement == null ) ? null : dateElement.getFirstChild();
      String date = ( t == null ) ? null : t.getNodeValue();
      if ( date == null ) {
         if (DEBUG) System.err.println(ME+"found no date");
         return;
      }
      if (DEBUG) System.err.println(ME+"date node? name is " + t.getNodeName());
      c.setDate (date.trim());
   }

   // Authors:  <dc:creator><name> text </name></dc:creator>
   //
   // cookAuthors - passes the trimmed value of the first child of every
   // <dc:creator> to handleAuthorName().
   // NOTE(review): the layout shown above nests the text inside <name>,
   // but only the first child's own value is read here.  A first child
   // that is an element has no value and is skipped - confirm which
   // layout the metadata really uses.
   private void
   cookAuthors() {
      NodeList nl = doc.getElementsByTagName ( "dc:creator" );
      for ( int i = 0; i < nl.getLength(); i++ ) { // author i
         Node a = (nl.item(i)).getFirstChild();
         if ( a == null ) continue;               // empty <dc:creator/>
         String name = a.getNodeValue();
         if ( name == null ) {
            // first child is not text; trimming it would have thrown
            if (DEBUG) System.err.println(ME+"skipping <dc:creator> " + i
               + " whose first child is not text");
            continue;
         }
         handleAuthorName (name.trim());
      }
      if (DEBUG) System.err.println(ME+nl.getLength()+" authors got cooked");
      //AuthorDatabase.dump();
   }

   // handleTitle - gives Creation c the metadata title if it has none
   // yet.  If c already has a title, the two are compared by word count
   // only and a mismatch is reported; c's title is not changed.
   private void handleTitle ( String text ) {
      if (DEBUG) {
         System.err.println(ME+"in handleTitle with text: <" + text+">");
      }

      String existing = c.getTitle();
      if ( existing == null || existing.equals("") ) {
         c.setTitle ( text );
         return;
      }

      // do a word-by-word comparison (so far only the counts are compared)
      int existingWords = new StringTokenizer ( existing ).countTokens();
      int incomingWords = new StringTokenizer ( text ).countTokens();
      if ( existingWords == incomingWords ) {
         return;   // same number of words, same words? (not checked)
      }

      System.err.println(ME+" in handleTitle.  Creation has title"
         + " of "+ existingWords + " tokens, metadata title has " + incomingWords);
      System.err.println(ME+"existing title:" + existing);
      System.err.println(ME+"metadata title: " + text);
   } // handleTitle

   // Here is where any new author names get merged into the Author database.
   // An Author is built from the trimmed text and looked up.  If the
   // database returns an Author equal to it, that older Author gets
   // the new spelling and is attached to Creation c; otherwise the new
   // Author is stashed in the database and attached to c.
   private void handleAuthorName ( String text ) {
      if (DEBUG) {
         System.err.println(ME+"in handleAuthorName -- " + text);
      }

      Author candidate = new Author ( text.trim() );      // build an Author
      Author known = AuthorDatabase.fetchSpecificAuthor ( candidate );

      boolean alreadyKnown = ( known != null ) && candidate.equals ( known );
      if ( alreadyKnown ) {
         // keep the old author; the candidate is simply dropped
         if (DEBUG) {
            System.err.println(ME+"in handleAuthorName found "
               + "older author with same name: " + known.toString() );
         }
         known.addSpelling(text);
         c.addAuthor (known);
      } else {
         AuthorDatabase.stashAuthor(candidate);
         c.addAuthor (candidate);
      }

      if (DEBUG) {
         System.err.println(ME+" have added author " + text
            + " to Creation " + c.toString() );
      }
   }

}
