// bergmark - December 2000 - Reference Linking Project

package Linkable.Analysis;

// RefLinker -
/** is a utility class (all of its methods are static) that converts
* document source into reference-linked document source.
*/

// Update history -
// 12-13-2000:  Fix to insertLinks: entify the anchor before comparing to text
// 12-13-2000:  Switch to a global StringBuffer to hold the document
// 12-14-2000:  Fix ord="91" which should be ord="10".  Replace in the
// string buffer rather than insert.  Entify all attribute string values.
// Added the punt() and getURLS() routines, and complete output of the 
// <reflink> tag.
// 01-02-2001:  Close off the BASE url
// 02-09-2001: add the tag to the refLink/ord
// 03-26-2001: change url= attributes to <url> sub-elements
// 04-05-2001: Fix bug in getContexts that caused only one anchor/context
// 04-06-2001: Fix indexing in linkText, scanForward, and insertLinks
// 04-09-2001: Had gotten the <reflink> and <url> embedding wrong. Fix this.

import Linkable.API.Reference;
import Linkable.Utility.Context;

import uk.ac.soton.harvester.Utils;

import java.util.Vector;
import java.util.Enumeration;
import java.util.Hashtable;

/**
 * Converts xhtml document source into reference-linked document source by
 * wrapping each reference anchor found in the text in a
 * <code>&lt;reflink&gt;</code> element.
 *
 * All methods are static and the working state (<code>document</code>,
 * <code>doc</code>, <code>kz</code>) lives in static fields, so two
 * overlapping calls to {@link #getLinkedText} would interfere with each other.
 *
 * Two coordinate systems are used while linking:
 * <ul>
 * <li><code>doc</code> - an immutable snapshot of the document taken before
 *     any reflink is inserted (newlines replaced by blanks);</li>
 * <li><code>document</code> - the StringBuffer that grows as reflinks are
 *     inserted.</li>
 * </ul>
 * Contexts are located in <code>doc</code>; anchors are located and replaced
 * in <code>document</code>.  linkText keeps one position in each system and
 * advances them in step.
 */
public class RefLinker {

   private static final String ME = "RefLinker: ";
   private static final boolean DEBUG = true;

   // How many characters past the search position locateAnchor examines.
   private static final int ANCHOR_WINDOW = 2000;

   private static StringBuffer document = null;   // will hold one xhtml doc
   private static String doc;   // snapshot of document taken by linkText
   // Point relative to start of context of first character after the
   // anchor replaced by a <reflink>
   private static int kz=0; 

   // ============== STATIC ROUTINES ==========================

   // getLinkedText -
   /** inserts a BASE url and the reflinks into a document.
   * Public, it is called by the Surrogate.
   * @param doc the xhtml document source
   * @param refList the references of this document
   * @param url the original network address of the document
   * @return the linked document source.  The empty string is returned
   * when doc or refList is null.
   */
   public static String getLinkedText 
      ( String doc, Reference[] refList, String url ) {
      if ( doc == null ) {
         System.err.println(ME+" has no document to process");
         return "";
      }
      document = insertBaseURL ( doc, url );
      Vector[] contexts = getContexts ( refList );
      return linkText (contexts, refList );
   }

   // ===============  PRIVATE ROUTINES =========================

   // getContexts -
   /** from a list of references extracts all the context information
   * that will be needed to reference link a source document.
   * Null references, null contexts and contexts whose ordinal is
   * smaller than 1 are ignored.
   * @param refList The reference list (Reference[])
   * @return an array of vectors indexed by (context ordinal - 1); each
   * vector holds the Context objects that share that ordinal.  Elements
   * for ordinals that no reference mentions are null.  Returns null when
   * refList is null.
   */
   private static Vector[] getContexts ( Reference[] refList ) {

      if ( refList == null ) {
         System.err.println(ME+" has no reference list to process");
         return null;
      } 

      // key is 1-based context ordinal value (as a String)
      // value is a Vector of Context objects, one per reference
      // in this context.
      Hashtable ht = new Hashtable();

      int numberOfContexts = 0;

      for ( int i=0; i<refList.length; i++ ) {
         Reference r = refList[i];
         if ( r == null ) continue;
         Context[] contexts = r.getContexts();
         if ( contexts == null ) continue;
         for ( int j=0; j<contexts.length; j++ ) {
            Context c = contexts[j];
            if ( c == null ) continue;
            int ordinal = c.getOrd();
            if ( ordinal < 1 ) {
               // would index the result array at a negative position
               System.err.println(ME+"reference "+i
                  +" has a context with invalid ordinal " + ordinal
                  +", ignored");
               continue;
            }
            if ( ordinal > numberOfContexts ) numberOfContexts= ordinal;
            String ord = Integer.toString(ordinal);
            Vector v = (Vector) ht.get (ord);
            if ( DEBUG ) System.err.println(ME+"reference "+i
               +" has context " + ord + ", in ht = " + (v != null) );
            if ( v == null ) {
               v = new Vector();
               ht.put(ord,v);
            }
            v.addElement(c);
         }
      }
      if ( DEBUG ) System.err.println(ME+"has finished making a hashtable "
      + "of " + ht.size() + " elements");

      // Convert Hashtable into an array of vectors and return it.
      // Note that some elements in the array may be the null vector.
      // These are "anchors" that failed to match up with any references.
      Enumeration ordinals = ht.keys();
      Vector[] result = new Vector[numberOfContexts];
      if ( DEBUG ) System.err.println(ME+" has allocated a Vector array of "
         + result.length + " elements");
      while ( ordinals.hasMoreElements() ) {
         String ord = (String)ordinals.nextElement();
         result[Integer.parseInt(ord)-1] = (Vector)ht.get(ord);
      }
      if ( DEBUG ) System.err.println(ME+"has finished making an"
         +" array of Vectors, each containing a list of Contexts");

      return result;

   }  // getContexts

   // linkText -
   /** does the "heavy lifting" of inserting reflinks into the text,
   * while making a single pass over the text.
   * The text is kept in a StringBuffer during processing to allow
   * for efficient insertions and the like.  It is converted to a
   * string just before returning.
   * A context that cannot be located is reported and skipped; its
   * anchors are left unlinked.
   * @param contexts An ordered array of contexts, each element a Vector of
   * Context objects, one per reference contained in the context
   * @param refList the array of Reference objects for this document
   * @return a String, which is the text with reflinks inserted into it,
   * or the empty string when contexts is null
   */
   private static String linkText ( Vector[] contexts, Reference[] refList ) {
      if ( DEBUG ) System.err.println(ME+" in linkText with "
         + (contexts==null?0:contexts.length) + " elements");
      if ( contexts == null ) return "";

      doc = document.toString().replace('\n',' ');

      // k  - resume position in "document" (the growing StringBuffer)
      // kx - the same place expressed as a position in "doc" (snapshot)
      int k=0, kx = 0;
      for ( int i=0; i<contexts.length; i++) {
         if ( contexts[i] == null || contexts[i].isEmpty() ) continue;
         int ky = scanForward ( kx, contexts[i] );
         if ( ky < kx ) {
            // not found (-1): do not feed a bogus offset to insertLinks
            System.err.println(ME+"context " + (i+1)
               + " could not be located at or after position " + kx
               + ", its anchors are left unlinked");
            continue;
         }
         // ky - kx is number of characters until where context is found
         k = insertLinks ( k+(ky-kx), contexts[i], refList );
         kx = ky + kz;     // skip over processed content
      }
      if ( DEBUG ) System.err.println(ME+" exiting linkText");
      return document.toString();
   }

   // scanForward -
   /** finds where in the document snapshot ("doc") this context is and
   * returns an index to its position.  If the whole context string is not
   * found, successively shorter prefixes of it are tried (halving each
   * time); once the prefix is 5 characters or shorter the search is handed
   * to punt(), which ignores tags.
   * @param k integer position in "doc" at which to start scanning.  This
   * will be the character following the previously replaced anchor.
   * @param v Vector of Context objects, all of which are for the same
   * context, but for different anchors within the context.
   * Assert: this vector is not null and not empty.
   * @return integer position in "doc" of where the context can be found,
   * or -1 if it cannot be found.
   * Side effect: when found without punting, sets "kz" to how many
   * characters lie between start of search and start of the context.
   */
   private static int scanForward ( int k, Vector v ) {

      // all the context strings are the same.  Use the one at element 0
      Context c = (Context)v.elementAt(0);  

      if ( DEBUG ) {
         System.err.println (ME+" in scanForward, position "
         + k + ", "+v.size()+" anchors as follows");
         for (int i=0; i<v.size(); i++ ) {
            // a separate variable, so that "c" stays element 0
            Context each = (Context) v.elementAt(i);
            System.err.println(each.getAnchor());
         }
      } 

      String context = c.getContext();
      if ( context == null ) {
         System.err.println(ME+"scanForward was given a null context");
         return -1;
      }
      String s = context;
      if ( DEBUG ) {
         int from = Math.min ( Math.max(k,0), doc.length() );
         int to = Math.min ( from+200, doc.length() );
         System.err.println("scanForward looking in doc("+k+":-) for ->"
         + s + "<-");
         System.err.println("doc("+k+":-) is ->" + doc.substring(from,to)
         + "<-");
      }
      int i;
      while ( (i = doc.indexOf(s, k)) == -1 ) {
         if ( s.length() > 5 ) s = s.substring(0,s.length()/2);
         else return punt ( k, context );
         if ( DEBUG ) 
         System.err.println("scanForward: not found, try ->"+s+"<-");
      }
      if ( DEBUG ) System.err.println(ME+" in scanForward, pos " + i
         + ", found " + s.length() + " characters of this pattern: " + s );
      kz = i-k;
      if ( DEBUG ) System.err.println(ME+"scanForward set kz to " + kz);
      return i;
   }  // scanForward

   // insertLinks -
   /** Given the position in the document of a context, and a list
   * of anchors (references) in that context, stick in the <reflink>s
   * @param k int position of context in "document"
   * @param v Vector of Context objects, one per reference in this context
   * @param refList the array of references for this document
   * @return The position in "document" where scanning should resume.
   * Side effect: document, the global StringBuffer, gets a <reflink>
   * inserted into it for every anchor that is found.
   * NOTE: the anchor must be entified before being grepped in the
   * JTidied-document, which is entified xhtml source.  The anchor is
   * de-entified, because it came out of a SAX-parsed tree.
   * Side effect: sets global int "kz" to point after the last linked
   * anchor string, relative to the beginning of the context and measured
   * in characters of the un-linked text.
   * Anchors that are null, that are not found, or whose reference index
   * does not name an element of refList are reported and skipped.
   */

   private static int insertLinks ( int k, Vector v, Reference[] refList ) {
      if ( v == null ) return k;

      if (DEBUG) System.err.println(ME+" in insertLinks at position " + k );

      int resume = k;                // where to start next
      kz = 0;                        // accumulate length in chunks

      for (int i=0; i<v.size(); i++) {
         k = resume;                 // handles multiple anchors
         Context c = (Context) v.elementAt(i);
         String anchor = c.getAnchor();
         if ( anchor == null ) {
            System.err.println(ME+"context has a null anchor, skipped");
            continue;
         }

         // Validate the reference BEFORE touching kz, so that kz and
         // resume always advance together.
         int refIndex = c.getRefIndex();
         if ( refList == null || refIndex < 0 || refIndex >= refList.length
              || refList[refIndex] == null ) {
            System.err.println(ME+"anchor ->" + anchor
               + "<- names reference index " + refIndex
               + " which is not in the reference list, skipped");
            continue;
         }
         Reference r = refList[refIndex];

         if ( DEBUG )
         System.err.println(ME+"Calling PCDATA to encode this string:=>" +
            anchor + "<=");
         String a = Utils.PCDATA(anchor);
         int j = locateAnchor (k, a);
         if ( j == -1 ) { 
            System.err.println(ME+"anchor ->" + a + "<- not found in text:\n"
               + doc );
         } 
         else {
            j += k;    //locateAnchor returns int relative to k
            // end of anchor relative to beg. of search
            kz += j+a.length()-k;   
            String replacementText = buildRefLink ( r, refIndex, a );
            document.replace(j,j+a.length(),replacementText);
            resume = Math.max( resume, j+replacementText.length() );
            if ( DEBUG ) {
               System.err.println(ME+"inserted a link at position "
               + j +": document[j-20:j+30] is:\n" +
               document.substring(j>=20?j-20:0,
               j<document.length()-30?j+30:document.length()));
               System.err.println(ME+"insertLinks set kz to " + kz);
               System.err.println(ME+"resuming at " + resume + " ->" 
               + document.substring(resume,
               resume+50<document.length()?resume+50:document.length()));
            }
         }
      }

      return resume;         // where to resume scanning

   } // insertLinks

   // buildRefLink -
   /** builds the <reflink> element that replaces one anchor.
   * @param r the reference the anchor points at (not null)
   * @param refIndex 0-based index of r; written 1-based as "ord"
   * @param a the entified anchor text, which becomes the element content
   * @return the text of the <reflink> element
   */
   private static String buildRefLink ( Reference r, int refIndex, String a ) {
      // only the first four characters of the date are written, as "year"
      String d = (r.getDocID()).getDate();
      d = ( d != null && d.length()>=4 ) ? d.substring(0,4) : "";
      return
         "\n<reflink ord=\"" + (refIndex+1) + "\" author=\""
         + entify((r.getDocID()).getFirstAuthorLastName()) + "\""
         + " year=\""+d+"\"\n"
         + "   title=\""+entify((r.getDocID()).getTitle())+"\"\n"
         + "   literal=\""
         + entify(r.getTag()) +" "
         + entify(r.getLiteral())+"\">\n"
         + getURLS(r)
         + a + "</reflink>";
   }

   // entify -
   /** null-tolerant wrapper around Utils.PCDATA.
   * @param s the string to entify, may be null
   * @return the entified string, or the empty string when s is null
   */
   private static String entify ( String s ) {
      return s == null ? "" : Utils.PCDATA(s);
   }

   // punt -
   /** recovers from a situation where the context cannot be found
   * in the document starting at position k probably because it
   * keeps running into tagged elements.  The context is matched against
   * the text of the snapshot "doc" with every <...> tag ignored.
   * The snapshot is searched (not the StringBuffer) because the caller,
   * scanForward, works in snapshot positions.
   * @param k position in "doc" at which to scan forwards for the context;
   * assumed to lie outside of any tag
   * @param s the context
   * @return the position in "doc" at which this context begins, or -1
   * if the context cannot be found (a serious error).
   */
   private static int punt ( int k, String s ) {
      if ( doc == null || s == null || s.length() == 0 ) return -1;

      int n = doc.length();
      if ( k < 0 ) k = 0;
      if ( DEBUG ) System.err.println(ME+"in punt looking for context '"
         + s + "' in document k ... '"
         + doc.substring(Math.min(k,n),Math.min(k+50,n))+"'");

      char first = s.charAt(0);
      boolean inTag = false;   // true while "start" is between < and >
      for ( int start=k; start<n; start++ ) {
         char d = doc.charAt(start);
         if ( inTag ) {
            if ( d == '>' ) inTag = false;
            continue;
         }
         if ( d == '<' ) {
            inTag = true;
            continue;
         }
         // candidate positions are text characters only
         if ( d == first && matchesIgnoringTags ( start, s ) ) return start;
      }

      System.err.println(ME+"in punt could not find context '" + s 
         + "' in document even ignoring all tags!\n"
         + doc.substring(Math.min(k,n)));

      return -1;

   } // punt

   // matchesIgnoringTags -
   /** tests whether the text of "doc" beginning at a given position,
   * with all tags skipped, starts with the given string.
   * @param start position in "doc" of a text character
   * @param s the string to match (not empty)
   * @return true if every character of s is matched
   */
   private static boolean matchesIgnoringTags ( int start, String s ) {
      int n = doc.length();
      int i = start;
      for ( int j=0; j<s.length(); j++ ) {
         // step over any run of adjacent tags before the next character
         while ( i < n && doc.charAt(i) == '<' ) {
            int close = doc.indexOf('>', i);
            if ( close == -1 ) return false;   // unterminated tag
            i = close + 1;
         }
         if ( i >= n || doc.charAt(i) != s.charAt(j) ) return false;
         i++;
      }
      return true;
   }

   // getURLS -
   /** extracts the online locations, if any, for a given work.
   * @param r The Reference to the work
   * @return a String suitable for output as part of a <reflink>
   * The returned string is a sequence of 0 or more <url> elements.
   * Null locations are omitted.
   * NOTE(review): the locations are written as-is, not entified; confirm
   * whether getURLs() already returns entified strings.
   */
   private static String getURLS ( Reference r ) {
      String[] locs = (r.getDocID()).getURLs();
      if ( locs == null ) return "";
      StringBuffer result = new StringBuffer();
      for ( int i=0; i<locs.length; i++ ) {
         if ( locs[i] == null ) continue;
         result.append("   <url>").append(locs[i]).append("</url>\n");
      }
      return result.toString();
   }

   // insertBaseURL -
   /** Copies the XML document into a StringBuffer and inserts a Base URL
   * to the original network address just before the end of the head
   * element, so that images, etc. can be fetched.
   * If url is null, or the document has no "</head>" (or "</HEAD>"),
   * the document is returned without a Base URL.
   * @param document The xhtml document (not null)
   * @param url The base URL
   * @return a String Buffer containing the altered document
   * NOTE(review): url is written as-is, not entified; confirm whether
   * callers pass an already entified address.
   */
   private static StringBuffer insertBaseURL ( String document, String url ) {
      StringBuffer buffer = new StringBuffer ( document );
      if ( url == null ) {
         System.err.println(ME+"has no base URL to insert");
         return buffer;
      }
      int i = document.indexOf("</head>");
      if ( i == -1 ) i = document.indexOf("</HEAD>");
      if ( i == -1 ) {
         // insert(-1, ...) would throw; leave the document as it is
         System.err.println(ME+"found no </head> in the document, "
            + "no BASE url inserted");
         return buffer;
      }
      buffer.insert(i,"<BASE href=\""+url+"\"/>\n");
      return buffer;
   }


   // locateAnchor -
   /** Finds where this anchor occurs next in the document, as text.
   * Only the ANCHOR_WINDOW characters of "document" starting at k are
   * examined.  An occurrence immediately followed by the two characters
   * "> (the end of an attribute value, as in an <a href="...">) is
   * passed over.
   * @param k Where in "document" to begin looking
   * @param anchor The anchor
   * @return location, relative to k, of the first acceptable occurrence
   * of the anchor, or -1 if there is none within the window.
   */
   private static int locateAnchor ( int k, String anchor ) {

      if ( anchor == null || k < 0 || k > document.length() ) return -1;

      // Get rid of the newlines (does not change the positioning)
      int end = Math.min ( k+ANCHOR_WINDOW, document.length() );
      String docx = document.substring(k,end).replace('\n',' ');

      // always advance by at least one character, so that an empty
      // anchor cannot loop forever
      int step = Math.max ( 1, anchor.length() );

      // Ignore any occurences inside an anchor element.  startsWith
      // simply answers false when fewer than two characters remain.
      int j = docx.indexOf(anchor);
      while ( j != -1 && docx.startsWith("\">", j+anchor.length()) )
         j = docx.indexOf(anchor, j+step);
      if ( DEBUG ) System.err.println(ME+"locateAnchor returning " + j);
      return j;
   }

} // ends class RefLinker
