// bergmark - April 2000 - The Reference Linking Project

// API for linking digital objects

// Change History:
//   July 27 - scrap parsing DLIB's XML metadata file, or at least
//   if there isn't one, have a plan B for getting the pub. date
// 2000-07-28: Dispense with DID's and add Javadoc comments. No MIMEfile.
// 2000-08-03: Make sure Creation for this Surrogate has pubData, URL
// 2000-10-03: Add save() to write the Surrogate to persistent store
// 2000-11-20: Add a third constructor for resurrecting an object
// 2000-12-04: Correct an "off-by-one" indexing error in the save() routine.
// 2001-03-28: Correct the filename in jtidyVersion
// 2001-04-16:  Add exception
// 2001-05-15:  getLinkedText should not throw an exception
//             jtidyVersion to prepend "file:" to jtidyoutput
// 2001-05-20:  Provide for a local date
// 2001-06-12:  Let the RefLinkAnalyzer supply a pubDate if necessary
// 2001-06-13: And set this pubDate into the Creation before cooking
//             local LMetadata because that involves updating the
//             Creation hashtable.
// (TBD):      Delete any previously existing Surrogate files when building
//             a new one.  Will prevent reconstruction of the reference list.

// 2001-06-23: Add getCitationList(), getReference() function and implement
//             addCitation() function. Change buildCitation() and addCitation()
//             from protected to public, called by ReferenceSection--buildRefList()
//             By: Yu Chen

// 2001-07-24: Use File.separator, and in getDOI(), don't try to parse
//             a null metadata filename.  (D. Bergmark)


package Linkable.API;

import java.util.Vector;
import java.util.Date;
import java.text.SimpleDateFormat;
import Linkable.Analysis.*;
import Linkable.Utility.*;
import java.net.*;
import java.io.*;

import org.w3c.dom.*;             // For getLinkedText, XML parser

/**
 * This class is the primary class in the Linkable.API package.
 * The Surrogate's methods comprise the Reference Linking API.
 * The public methods called by reference linking applications are:
 * constructor (there are three types), getLinkedText,
 * getReferenceList, getCurrentCitationList, getMyData, and save.
 */

public class Surrogate {

   private static final String ME = "Surrogate: "; // for diagnostics
   private static final boolean DEBUG = CONFIG.DEBUG;

   private BibData myData;         // URN and metadata for this Item
   private Reference[] refList;    // The references in this Item
   private Vector knownCitations;  // Growing list of citations
   private String myURL;           // Network address of our Item
   private String localURL;        // Address of Item to be parsed
   private String docID;           // Our "DOI"
   private String localMetaData;   // Original text fragments in this
                                   // Item corresponding to title, etc.

   // The following is not really a part of the Surrogate class - it
   // is just a place to save the publication date should it be
   // discovered before our BibData object is constructed.
   private String pubDate;

   // The following contains this Surrogate's RefLinkAnalyzer.  It stays
   // null for a Surrogate that was reconstructed from files.
   RefLinkAnalyzer a = null;

   /**
    * Constructor - make a surrogate for the item at this network address.
    * Delegates to the two-argument constructor, using the same URL as
    * both the local and the network address.
    * @param url the String giving the network location of the item
    * @throws IllegalStateException if no surrogate can be built for the
    * item (errors are also printed on System.err)
    */
   public Surrogate (String url) {
      this ( url, url );
   }

   /**
    * Constructor - make a surrogate for the item at the local address
    * specified by the first string, with the network address in the
    * second string (needed for processing local copies of archives).
    * @param _localURL String giving the file: url of local copy of the item
    * @param netURL String giving the network url of the item
    * @throws IllegalStateException if the DOI cannot be derived or no
    * analyzer can be built for the item.  A constructor cannot return
    * null; previously these failures surfaced as a bare
    * NullPointerException a few statements later.  Errors are also
    * printed on System.err.
    */
   public Surrogate ( String _localURL, String netURL ) {
      System.err.println(ME+"making Surrogate for item "+_localURL);
      myURL = netURL;
      localURL = _localURL;

      // Set up a BibData item, without a URN
      Creation myWork = null;
      try {
         docID = getDOI( myURL );           // might also set pubDate
         myData = new BibData ( docID, null );
         myWork = myData.getCreation();

         // Get an analyzer specific for this kind of item
         a = getRefLinkAnalyzer( localURL );  // might also set pubDate
      } catch (SurrogateException se) {
         errorExit ( new SurrogateException
            ("Cannot make a surrogate. " + se.toString()) );
         // Nothing below can work without an analyzer, so stop here with
         // a descriptive unchecked exception instead of a later NPE.
         throw new IllegalStateException ( ME + "cannot make a surrogate for "
            + _localURL + ": " + se.toString() );
      }

      // Use the Analyzer to construct the remaining fields.  Insert
      // those values into the Creation template, myWork.

      // Get whatever local metadata can be parsed from this item.
      // This returns a hunk of XML containing title, authors, date,
      // synthesized urn, url of this item ... as seen in *this* item
      // If pubDate is defined, it came from the document's URL.
      // Otherwise, it is possible that analyzer construction found it
      // in <meta> elements
      localMetaData = a.buildLocalMetaData( docID, pubDate, myWork );
      if ( pubDate == null || pubDate.equals("") ) {
         pubDate = a.getDate();
      }
      if ( DEBUG ) {
         // plain concatenation so that null metadata prints as "null"
         System.err.println(ME+"Local Metadata:\n"+localMetaData);
      }

      // Next, "cook" the raw data into various fields of myData's Creation
      // By parsing the local meta data.  Merges our creation with the one
      // in the database, or stores a new creation at our synthesized URN.
      // The date must be set before cooking (see change history 2001-06-13).
      myWork.setDate(pubDate);
      myData.cook(localMetaData);
      myWork.setDisplayID(myURL);

      refList = a.buildRefList( myData );
      knownCitations = a.buildCitationList(myData.getURN().trim());

      // Braces are required here: without them the "else" attached to the
      // inner DEBUG test and "NULL citation" could never be printed.
      if ( DEBUG ) {
         if ( this.knownCitations != null ) {
            System.err.println(knownCitations.toString());
         } else {
            System.err.println("NULL citation");
         }
      }
   }

   /**
    * Surrogate reconstructor - reads data in from files, does not analyze.
    * At least one of the optional arguments must be specified.
    * The first non-null parameter is used.  Failures are reported on
    * System.err (stderr) and leave the object only partially populated.
    * @param dir The directory name of the repository containing the
    *          surrogate to be reconstructed, ending with "/Surrogate"
    * @param urn The URN of the surrogate to be reconstructed.  Will be
    *          looked up in the hash table.  It is an error if a surrogate
    *          for this URN already exists in memory. (TBD)
    * @param doi doi of the surrogate to be reconstructed.  A default
    *          repository name will be built for it. (TBD)  When non-null
    *          it also replaces the docID read from the files.
    */
   public Surrogate ( String dir, String urn, String doi ) {
      super();
      System.err.println(ME+"Reconstructing a Surrogate");
      try {
         if ( dir != null ) {
            if ( reconstruct ( dir, urn ) ) {
               System.err.println(ME+"Reconstruction succeeded");
            }
         } else if ( urn != null ) {
            // TBD - useFetchCreation
         } else if ( doi != null ) {
            // TBD - construct default filename
         } else {
            errorExit ( new SurrogateException (
               "Reconstructing a Surrogate did not succeed because a null "
               + "directory was given, as in 'new Surrogate (null, x, y)'."
               + "(second and third parameters, urn and doi, are not yet "
               + "implemented)." ) );
         }
      } catch (SurrogateException se) {
         errorExit ( new SurrogateException
            ("Cannot make a surrogate. " + se.toString()) );
      }
      // Only override the docID when the caller actually supplied one;
      // assigning unconditionally wiped out the value just reconstructed.
      if ( doi != null ) {
         this.docID = doi;
      }
   }

   /**
    * Reports a construction failure on System.err, including the stack
    * trace of the given exception.
    * @param se the exception describing what went wrong
    * @return always null
    */
   private Surrogate errorExit(SurrogateException se) {
      System.err.println("It was not possible to construct a Surrogate:\n"
         + se.toString() + "\n" );
      se.printStackTrace();
      return null;
   }

   // getLinkedText --

   /**
    * Returns an XML file which includes the original contents of the Item
    * plus reference anchors located inside a {@code <reflink>} element,
    * for further processing.  As a side effect, a reconstructed Surrogate
    * (one without an analyzer) has its localURL set to the JTidy output.
    * @return an XML version of the Item, or null if the source
    * for the Item cannot be found or the analyzer fails.
    */
   public byte[] getLinkedText () {
      String result = "<?xml version=\"1.0\"?>\n";

      if ( a == null ) { // surrogate resurrected - no analyzer
         localURL = jtidyVersion (myURL);
         String document = GetStream.readInputStream ( localURL );
         if ( document == null ) {
            // NOTE(review): assumes readInputStream signals an unreadable
            // source with null - confirm against GetStream.
            System.err.println(ME+"no source found at " + localURL);
            return null;
         }
         result = result
            + RefLinker.getLinkedText( document, refList, myURL );
      } else {  // Surrogate constructed on fly - use analyzer
         try {
            result = result + a.getLinkedTextInitialize()
               + a.getLinkedText (refList, myURL)
               + a.getLinkedTextFinalize();
         } catch ( SurrogateException se ) {
            System.err.println(se);
            return null;
         }
      }
      return result.getBytes();
   }

   /**
    * Returns this item's references as an XML document: a
    * {@code <referencelist>} element whose length attribute is the number
    * of references, followed by one line per Reference (its toString()).
    * A missing reference list yields an empty list of length 0.
    * @return the bytes of the XML document (platform default encoding)
    */
   public byte[] getReferenceList(){
      StringBuffer xml = new StringBuffer ("<?xml version=\"1.0\"?>\n");
      xml.append("<referencelist xmlns:dc=\"http://purl.org/DC\" length=\"");
      if ( refList != null ) {
         xml.append(refList.length).append("\">\n");
         for (int i = 0; i < refList.length; i++) {
            xml.append(refList[i].toString()).append("\n");
         }
      } else {
         xml.append("0\">\n");
      }
      xml.append("</referencelist>\n");
      return xml.toString().getBytes();
   }

   /**
    * Returns the list of known citations of this document as an XML
    * document: a {@code <citationlist>} element whose length attribute is
    * the number of citations, followed by one line per entry (its
    * toString()).
    * @return the bytes of the XML document (platform default encoding)
    */
   public byte[] getCurrentCitationList () {
      StringBuffer xml = new StringBuffer ("<?xml version=\"1.0\"?>\n");
      if ( knownCitations == null ) {
         xml.append("<citationlist length=\"0\"></citationlist>\n");
         return xml.toString().getBytes();
      }
      int count = knownCitations.size();
      xml.append("<citationlist length=\"").append(count).append("\">\n");
      // TBD: Iterate over Citation objects in the knownCitations Vector
      for (int i = 0; i < count; i++) {
         xml.append(knownCitations.elementAt(i).toString()).append("\n");
      }
      xml.append("</citationlist>\n");
      return xml.toString().getBytes();
   }

   /**
    * Returns the bibliographic data for this item, which includes this
    * item's URN and OAMS metadata.
    * @return the bytes of myData.toXML("")
    */
   public byte[] getMyData() {
      return (myData.toXML("")).getBytes();
   }

   /**
    * getRefID implements Carl's "Is this one of your references?" question
    * as well as his "How do you reference this?" question.
    * Not yet implemented.
    * @param b partial bibliographic data to match (currently ignored)
    * @return null (TBD: all the References in this document that might
    * correspond to the partial data contained in "b")
    */
   public byte[] getRefID ( BibData b ) {
      return null;
   }

   /**
    * getCitationID implements Carl's "am I one of your citations?"
    * question.  Not yet implemented.
    * @param b partial bibliographic data to match (currently ignored)
    * @return null (TBD: all of the known citations of this document
    * that correspond to the partial metadata contained in "b")
    */
   public byte[] getCitationID ( BibData b ) {
      return null;
   }

   /**
    * Placeholder for methods that return co-cited, co-referenced papers
    * of this one.
    * @return null
    */
   public BibData[] getRelatedPapers() { return null; }

   /**
    * Return the XML string for this surrogate: an XML declaration and a
    * {@code <surrogate>} element (id is docID + "/Surrogate") wrapping the
    * XML of myData's Creation.
    * @param pad prefix placed before each line written here and passed on
    * to the Creation's toXML
    * @return the XML text
    */
   public String toXML ( String pad ) {
      return "<?xml version=\"1.0\"?>\n"
         + pad + "<surrogate id=\""+docID+"/Surrogate\">\n"
         + pad + myData.getCreation().toXML(pad)
         + pad + "</surrogate>\n";
   }

   /**
    * @return a one-line description naming the item's network address
    */
   public String toString() {
      return "Surrogate for the item at " + myURL + "\n";
   }

   /**
    * save - write this Surrogate to persistent store under its own docID.
    * @return true if all went well.
    */
   public boolean save () {
      return save (docID);
   }

   /**
    * save - this Surrogate should be saved to persistent store
    * as several XML files (myData, refList, knownCitations) in the
    * directory CONFIG.REPOSITORY/doi/Surrogate.  Once all three files are
    * written, myData and every Reference are asked to save themselves.
    * @param doi the archive and item name, e.g. "foo/bar", of where to
    * write out the data for the Surrogate.
    * @return true if all went well; false if doi is null or any of the
    * files could not be written.
    */
   public boolean save ( String doi ) {
      if ( doi == null ) {
         // would otherwise silently write into a directory named "null"
         System.err.println(ME+"cannot save: no document identifier (doi) is known");
         return false;
      }
      String dirname = CONFIG.REPOSITORY + File.separator + doi
         + File.separator + "Surrogate";
      new File (dirname).mkdirs();  // a failure shows up in saveData below

      // write out this item's metadata
      boolean written =
            saveData (dirname + File.separator + "myData",
                      this.toXML("").getBytes() )
         && saveData (dirname + File.separator + "refList",
                      getReferenceList() )
         && saveData (dirname + File.separator + "knownCitations",
                      getCurrentCitationList() );
      if ( !written ) {
         System.err.println(ME+" something went wrong saving to "
            + "directory \"" + dirname + "/\"");
         return false;
      }

      // Now that we know all went fabulously well, tell CreationDatabase
      myData.save ( doi );
      if ( refList != null ) {
         for ( int i = 0; i < refList.length; i++ ) {
            // deliberately "" rather than doi (the doi variant was
            // disabled by the original author)
            refList[i].save ( "" );
         }
      }
      if ( DEBUG ) System.err.println(ME+" saved successfully");
      return true;
   }

   /**
    * reconstruct -
    * Reconstruct the Surrogate found in the specified location.
    * The files found there are read in and used to populate the fields of
    * this surrogate object (myURL, docID, myData, refList and
    * knownCitations).
    * @param dir String giving the directory which contains the
    * Surrogate files.
    * @param urn if non-null and different from the urn recorded in the
    * myData file, it is used instead of the recorded one.
    * @return false if the myData file cannot be parsed or holds no
    * identifiers; true otherwise (even when no citations can be read).
    * @throws SurrogateException if the surrogate's directory is unreadable
    */
   private boolean reconstruct ( String dir, String urn )
   throws SurrogateException {
      File f = new File ( dir );
      if ( !f.canRead() ) {
         throw new SurrogateException (
            "Fatal error in reconstruction of the Surrogate: "
            + "directory " + dir + " not readable");
      }
      String myDataFile = dir + File.separator + CONFIG.myData;
      if ( DEBUG ) {
         System.err.println(ME+" has read in the following file from "
            + myDataFile + ":\n");
         System.err.println( MetaData.getData ( myDataFile ) );
      }
      Document doc = MetaData.getDOM ( MetaData.getData ( myDataFile ) );
      if ( doc == null ) {
         System.err.println(ME+" failed to reconstruct; null Document");
         return false;
      }
      String[] ids = MetaData.getValues ( doc, "dc:identifier" );
      if ( ids == null ) return false; // need one identifier

      // Sort the identifiers by scheme prefix into url, doi and urn.
      String myUrn = null;
      for (int i = 0; i < ids.length; i++) {
         if ( DEBUG ) {
            System.err.println(ME+"reconstruct, identifier="+ids[i]);
         }
         if ( ids[i] == null ) continue;
         if ( ids[i].startsWith("http:") ) myURL = ids[i];
         if ( ids[i].startsWith("doi:") ) docID = ids[i].substring(4);
         if ( ids[i].startsWith("urn:") ) myUrn = ids[i].substring(4);
      }

      // A caller-supplied urn wins only when the file also recorded a urn
      // and the two differ; otherwise the recorded one (possibly null) is
      // used.
      boolean overrideUrn =
         urn != null && myUrn != null && !myUrn.equals(urn);
      myData = new BibData ( docID, overrideUrn ? urn : myUrn );

      localURL = null;  // will need to get one if getLinkedText is called
      // TBD: fill in the rest of the metadata for this creation

      refList =
         Reference.reconstruct (dir + File.separator + CONFIG.refList);

      Document cdoc = MetaData.getDOM
         ( MetaData.getData ( dir + File.separator + CONFIG.knownCitations) );
      if ( cdoc == null ) {
         System.err.println(ME+" failed to reconstruct citation; nullDocument");
         return true;
      }

      // Rebuild each citation (its work and its contexts) from the
      // CITATION elements of the knownCitations document.
      NodeList cl = cdoc.getElementsByTagName("CITATION");
      if ( cl == null ) {
         return true;
      }
      for ( int i = 0; i < cl.getLength(); i++ ) {
         Element ref = (Element)cl.item(i);
         Element work = (Element)ref.getElementsByTagName("work").item(0);
         Creation c = Creation.doWork(work);

         Element context_list =
            (Element)ref.getElementsByTagName("contexts").item(0);
         Context[] contexts = Context.reconstruct(i, (Element)context_list);

         addCitation(new Citation(c, contexts, Citation.REFERENCE));
      }
      return true;
   }

   // UTILITY METHODS USEFUL FOR OTHER MEMBERS IN THE API PACKAGE

   /**
    * Given a Reference, construct from it a new Citation and return it.
    * The Citation type should be REFERENCE because we found the citation
    * among the references belonging to another work.
    * Both the Reference object and the new Citation object correspond to
    * the same work.
    * @param r the Reference to convert
    * @return a new Citation built from r's docID and contexts
    */
   public static Citation buildCitation ( Reference r ) {
      return new Citation ( r.getDocID(), r.getContexts(), Citation.REFERENCE );
   }

   /**
    * Add this Citation to our knownCitations, creating the list on first
    * use.
    * @param c the Citation to append
    */
   public void addCitation ( Citation c ) {
      if ( DEBUG ) {
         System.err.println(ME + "addCitation to knownCitations");
      }
      if ( this.knownCitations == null ) {
         this.knownCitations = new Vector();
      }
      this.knownCitations.addElement(c);
   }

   /**
    * @return this Surrogate's own reference array (not a copy); may be null
    */
   public Reference[] getReferences() {
      return this.refList;
   }

   /**
    * @return this Surrogate's own citation Vector (not a copy); may be null
    */
   public Vector getCitationList() {
      return this.knownCitations;
   }


   // PRIVATE METHODS

   // getRefLinkAnalyzer -
   /**
    * Constructs an analyzer for the item, depending on the MIME type
    * reported by getHEAD: an HTMLAnalyzer for "text/html" and an
    * XHTMLAnalyzer for "text/xml".
    * Sad to say, MIMEtype for xhtml files is "text/html" also.
    * 2000-08-04: better to just use the file extension (sigh)
    * @param localURL is where a local copy (or reachable remote copy)
    * can be found of the item to be parsed/analyzed.
    * @throws SurrogateException if the XHTML analyzer cannot be set up or
    * if the item has any other MIME type (including the empty type that
    * getHEAD reports when the item cannot be opened).
    * @return the appropriate analyzer for this item, never null.
    */
   private RefLinkAnalyzer getRefLinkAnalyzer ( String localURL )
   throws SurrogateException {
      if ( DEBUG ) {
         System.err.println(ME+"in getRefLinkAnalyzer...");
      }
      String MIMEtype = getHEAD ( localURL );
      if ( DEBUG ) {
         System.err.println(ME+"URL has MIMEtype " + MIMEtype);
      }

      if ( "text/html".equals(MIMEtype) ) {
         // tell it what URL to parse
         return new HTMLAnalyzer( localURL, myURL );
      }
      if ( "text/xml".equals(MIMEtype) ) {
         XHTMLAnalyzer xa = new XHTMLAnalyzer();
         try {
            xa.setURL ( localURL, myURL );   // causes some initial parsing
         } catch (SurrogateException se) {
            // keep the underlying reason in the message
            throw new SurrogateException("Unable to construct the "
               + "appropriate parser for item " + localURL
               + ": " + se.toString());
         }
         return xa;
      }
      // (RiggedAnalyzer, once used here for debugging, needs LOTS of fixes)
      throw new SurrogateException("No analyzer is available for item "
         + localURL + " (MIME type \"" + MIMEtype + "\")");
   }

   /**
    * Get the MIMEtype of the document at the given URL.
    * Maps ".html" and ".htm" to "text/html" and ".xml" and ".xhtml" to
    * "text/xml" from the extension alone.  Otherwise opens the URL and
    * returns its content type with any parameters (such as
    * "; charset=...") removed.  When the connection reports a date and no
    * pubDate is known yet, pubDate is set from it as yyyy-MM-dd.
    * @param url the address to inspect
    * @return the MIME type, or "" if url is null, the URL cannot be
    * opened, or no content type is reported
    */
   private String getHEAD ( String url ) {
      if ( url == null ) {
         return "";
      }

      // A url without any "." has no extension to look at.
      int dot = url.lastIndexOf( "." );
      String extension = ( dot >= 0 ) ? url.substring ( dot ) : "";
      if ( extension.equals(".xml") || extension.equals(".xhtml") ) {
         return "text/xml";
      }
      if ( extension.equals(".html") || extension.equals(".htm") ) {
         return "text/html";
      }

      try {
         URLConnection uc = new URL(url).openConnection();
         String result = uc.getContentType();
         long x = uc.getDate();
         if ( x > 0 ) {
            // "MM" is month-of-year; the former "mm" formatted minutes.
            String urlDate =
               new SimpleDateFormat("yyyy-MM-dd").format( new Date(x) );
            if ( DEBUG ) {
               System.err.println(ME+"contents of url's date: " + urlDate);
            }
            if ( pubDate == null || pubDate.equals("") ) pubDate = urlDate;
         }
         if ( DEBUG ) {
            System.err.println(ME + "contents of url's content type: " + result);
         }
         if ( result == null ) {
            return "";   // content type not known
         }
         // Drop parameters so "text/html; charset=..." matches "text/html".
         int semi = result.indexOf( ';' );
         if ( semi >= 0 ) {
            result = result.substring ( 0, semi );
         }
         return result.trim();
      } catch (IOException e) {
         System.err.println(ME+"Problem opening the URL");
         return "";
      }
   }

   /**
    * Sometimes it is possible to derive a DOI from an item's URL.
    * DecodeURL.decode yields a vector whose first element is used as the
    * DOI; the second element could be the URL of the item's metadata
    * (only reported, since parsing that file proved too unreliable) and
    * the third element could be a String giving the publication date,
    * which is stored in pubDate when no pubDate is known yet.
    * @param remoteURL the network address of the item
    * @return the DOI, or null when the URL decodes to nothing
    * @throws SurrogateException if the remoteURL cannot be decoded
    */
   private String getDOI ( String remoteURL ) throws SurrogateException {
      // Try to get a URN for this url;
      Vector v = DecodeURL.decode (remoteURL);
      if ( DEBUG ) {
         String s = (v == null) ? "0" : Integer.toString(v.size());
         System.err.println(ME+"DecodeURL.decode returned a vector of length "+s);
      }
      if ( v == null ) {
         return null;
      }

      String work = null;
      if ( v.size() > 0 ) {      // an empty vector carries no DOI
         work = (String)v.elementAt(0);
      }
      if ( v.size() > 1 && DEBUG ) {
         String xmlUrl = (String)v.elementAt(1);
         System.err.println(ME+"metadata file thought to be at " + xmlUrl);
      }
      if ( (pubDate == null || pubDate.equals("")) && v.size() > 2 ) {
         pubDate = (String)v.elementAt(2);
      }
      if ( DEBUG ) {
         System.err.println(ME+"got a hypothetical pub date of <"
            + pubDate + ">");
      }
      return work;
   }

   /**
    * Writes the given bytes to the named file, replacing any existing
    * content.  The bytes are written exactly as given.
    * @param filename path of the file to write
    * @param data the bytes to store
    * @return true if the data was written and the file closed without
    * error; false otherwise (including null arguments)
    */
   public boolean saveData ( String filename, byte[] data ) {
      if ( filename == null || data == null ) {
         System.err.println(ME+"Writing " + filename + " failed");
         return false;
      }
      System.err.println(ME+"Writing byte data ( " + data.length +
         " bytes) to " + filename );

      FileOutputStream out = null;
      boolean ok = false;
      try {
         File f = new File ( filename );
         if ( f.exists() ) {
            System.err.println(ME+" file exists.  Overwriting...");
         }
         out = new FileOutputStream ( f );
         out.write ( data );
         out.flush();
         ok = true;
      } catch ( Exception e ) {
         // broad on purpose: a SecurityException also means "not saved"
         System.err.println(ME+"Writing " + filename + " failed");
         System.err.println(ME + e.toString());
      } finally {
         if ( out != null ) {
            try {
               out.close();
            } catch ( IOException e ) {
               System.err.println(ME+"Writing " + filename + " failed");
               ok = false;
            }
         }
      }
      return ok;
   }

   /**
    * Gives the location of the JTidy output file as a "file:" url.
    * TBD: check the MIME type of the url and, if "text/html", convert it
    * to xml first; for now the argument is not used.
    * @param url the address of the item (currently ignored)
    * @return "file:" followed by CONFIG.tidyOutput
    */
   private String jtidyVersion ( String url ) {
      return "file:"+CONFIG.tidyOutput;
   }

}

