// bergmark - April, 2000 - Reference Linking Project

// HTMLAnalyzer implements the RefLinkAnalyzer interface for HTML
// files.  Only one HTMLAnalyzer is instantiated per Surrogate.
// It runs JTidy over the input file, which is HTML source, and if
// that goes OK instantiates an XHTMLAnalyzer to parse the results.

// July 28: backed out DID, parameter to buildLocalMetaData has changed
// September 26, 2000: make this just a front end for the XHTMLAnalyzer
// September 29, 2000: handle case where JTidy doesn't work.
// April 16, 2001:     added exception handling
// 2001-04-18:    Corrected filenames in constructor

package Linkable.Analysis;

import Linkable.API.*;
import Linkable.Utility.*;

import java.io.*;
import java.net.*;
import java.util.Vector;

// CLASSPATH: must include the path to JTidy, e.g.
//   $HOME/bergmark/public/src/tools/JTidy/src/30apr2000
import org.w3c.tidy.Tidy;

/**
 * RefLinkAnalyzer implementation for HTML input.
 *
 * The constructor runs JTidy over the HTML source to produce XHTML in
 * CONFIG.tidyOutput and, if JTidy reports no parse errors, creates an
 * XHTMLAnalyzer pointed at that output.  Every analysis method simply
 * delegates to that XHTMLAnalyzer.
 */
public class HTMLAnalyzer implements RefLinkAnalyzer {

   private static final String ME = "HTMLAnalyzer: "; // for diagnostics
   private static final boolean DEBUG = CONFIG.DEBUG; // debug switch

   // For converting HTML to XHTML.  The two streams are the ones used by
   // the most recent runTidy call; they are closed before runTidy returns.
   Tidy tidy = null;
   BufferedInputStream tidyIn;
   FileOutputStream tidyOut;

   // Delegate that does the real analysis on the tidied XHTML.
   XHTMLAnalyzer xa = null;

   // Never assigned in this class, so getDate() currently returns null.
   private String pubDate = null;
   public String getDate(){return pubDate;}

   /**
    * Constructor for the case where the document is read from the same
    * URL that identifies it.
    * @param url URL of the HTML document to convert and analyze
    * @throws SurrogateException if the HTML cannot be converted to XHTML
    */
   public HTMLAnalyzer ( String url ) throws SurrogateException {
      this ( url, url );
   }

   /**
    * Constructor.  Converts the HTML at localURL to XHTML with JTidy and
    * sets up an XHTMLAnalyzer over the converted file.
    * @param localURL URL the HTML is actually read from
    * @param url      URL passed on to the XHTMLAnalyzer and used in the
    *                 error message
    * @throws SurrogateException if JTidy could not convert the input
    */
   public HTMLAnalyzer ( String localURL, String url )
   throws SurrogateException {
      super ( );
      tidy = new Tidy();
      tidy.setXHTML(true);           // output XHTML
      tidy.setNumEntities(true);     // output numeric entities
      tidy.setDocType("omit");       // don't bother with DOCTYPE
      if ( ! runTidy( localURL ) ) {
         throw new SurrogateException("Unable to convert " + url +
            " to XML (XHTML) for parsing.  Input HTML file too ambiguous. "
            + "See diagnostics in relevant file within Errors/ directory.");
      }
      if (DEBUG) {
         System.err.println(ME+"constructed an HTML parser");
      }
      xa = new XHTMLAnalyzer ( );
      xa.setURL ( "file:"+CONFIG.tidyOutput, url );
   }


   // ==================================================
   // Required methods for the RefLinkAnalyzer interface
   // ==================================================

   /**
    * buildLocalMetaData delegates to the XHTMLAnalyzer.
    * @return the delegate's result, or "" if there is no delegate
    */
   public String buildLocalMetaData( String DOI, String pubDate, Creation c ) {
      if ( xa == null ) return "";
      return xa.buildLocalMetaData ( DOI, pubDate, c );
   }

   /**
    * buildRefList delegates to the XHTMLAnalyzer.
    * @return the delegate's array of Reference objects, or null if there
    *         is no delegate
    */
   public Reference[] buildRefList( BibData b ) {
      if ( xa == null ) return null;
      return xa.buildRefList( b );
   }

   /**
    * buildCitationList delegates to the XHTMLAnalyzer.  (Original author's
    * note: this involves calls on the citeref database, which is indexed
    * by document id.)
    * @return the delegate's Vector of Citation objects, or null if there
    *         is no delegate
    */
   public Vector buildCitationList( String docURN ) {
      if ( xa == null ) return null;
      return xa.buildCitationList( docURN );
   }

   /**
    * getLinkedText delegates to the XHTMLAnalyzer, which produces the
    * linked text for this document.
    * @param refList the array of Reference objects belonging to this Surrogate
    * @param url     the net URL of the document, for a base URL
    * @throws SurrogateException if the delegate throws it
    * @return String to be added to the XML, or "" if there is no delegate
    */
   public String getLinkedText ( Reference[] refList, String url )
   throws SurrogateException {
      if ( xa == null ) return "";
      return xa.getLinkedText( refList, url );
   }

   // These entry points support getLinkedText

   /**
    * getLinkedTextInitialize has nothing to contribute for HTML input.
    * @return always the empty string
    */
   public String getLinkedTextInitialize() {
      return "";
   }

   /**
    * getLinkedTextFinalize has nothing to contribute for HTML input; it
    * only logs a trace line when debugging is on.
    * @return always the empty string
    */
   public String getLinkedTextFinalize () {
      if (DEBUG) {
         System.err.println(ME+"in getLinkedTextFinalize");
      }
      return "";
   }

// ==============  PRIVATE METHODS ============================


   /**
    * Run JTidy on the HTML at the given URL to convert it to XHTML.
    * The converted text is always written to CONFIG.tidyOutput, so each
    * run overwrites the previous one.  JTidy's diagnostics go to a file
    * named after the last path segment of the URL, inside the
    * CONFIG.tidyError directory.
    * @param url URL of the HTML input
    * @return true if JTidy reported zero parse errors; false if it
    *         reported errors or anything threw while setting up or parsing
    */
   private boolean runTidy( String url ) {
      if ( DEBUG ) {
         System.err.println( ME + " run Tidy on url " + url );
      }
      File errorDir = new File(CONFIG.tidyError);
      if ( ! errorDir.exists() ) {
         errorDir.mkdirs();
      }
      // Write the log inside the directory we just made sure exists.
      File errFile = new File(errorDir, errorFileNameFor(url));

      PrintWriter errOut = null;
      try {
         errOut = new PrintWriter(new FileWriter(errFile), true);
         tidy.setErrout(errOut);
         if ( DEBUG ) {
            System.err.println(ME+"has set tidy's error output to "
               + errFile.getPath() + ".");
         }
         tidyIn = new BufferedInputStream(new URL(url).openStream() );
         if ( DEBUG ) {
            System.err.println(ME+"has set up a Buffered Input Stream for Tidy");
            System.err.println(CONFIG.tidyOutput);
         }
         tidyOut = new FileOutputStream(CONFIG.tidyOutput);
         if ( DEBUG ) {
            System.err.println(ME+"has set up a an output file for Tidy");
         }
         tidy.parse( tidyIn, tidyOut );
         if ( tidy.getParseErrors() == 0 ) {
            if ( DEBUG ) {
               System.err.println (ME+"Tidy has parsed input to tidyOutput.xml ");
            }
            return true;
         }
         System.err.println
            ("\n==================\nUnable to parse this paper");
         return false;
      } catch ( Exception e ) {
         System.err.println(ME+"for some reason failed to translate this url.");
         System.err.println(e.toString());
         return false;
      } finally {
         // Release the file and network handles on every path; the output
         // must be closed before the XHTMLAnalyzer opens the same file.
         closeQuietly(tidyOut);
         closeQuietly(tidyIn);
         closeQuietly(errOut);
      }
   }

   // Name of the diagnostics file for a URL: the text after the last '/',
   // the whole URL if it has no '/', or "unnamed" if nothing follows the '/'.
   private static String errorFileNameFor( String url ) {
      String name = url.substring(url.lastIndexOf('/') + 1);
      if ( name.length() == 0 ) {
         return "unnamed";
      }
      return name;
   }

   // Close a stream or writer, ignoring null and reporting (not throwing)
   // any failure, so that cleanup never masks the result of runTidy.
   private static void closeQuietly( Closeable c ) {
      if ( c == null ) return;
      try {
         c.close();
      } catch ( IOException e ) {
         System.err.println(ME+"could not close a stream: " + e.toString());
      }
   }
}
