// bergmark - September 2000 - Reference Linking project

package Linkable.Analysis;

// ReferenceSection

/** The ReferenceSection class is responsible for analyzing contents of the
  * reference section of an online paper.  It makes use of Southampton's
  * dodecite for individual references.  One ReferenceSection object is
  * instantiated for each paper analyzed, whether or not there are references.

  * Each reference consists of a tag (possibly null) and a reference string.
  * The reference string contains author names, publication year, title of
  * the work.

  * As each reference string is parsed, it is turned into a Reference object and
  * stored in a vector of Reference objects.  It is also looked up in/added
  * to the Creation database.

  * The buildRefList() procedure can be called to obtain the final
  * array of Reference objects.

  * The static isRefSectionHeader procedure can be called to determine
  * whether a header is the start of a Reference Section.

  * All this code has been moved here from the various parsers.  Code in
  * this class should be totally independent of whether an HTML, XHTML,
  * ASCII, or Postscript document is being parsed.  This code works with
  * unmarked up hunks of text.
  */

// Update History:
// 2000-09-06: Created from code extracted from XHTMLAnalyzer.java
// 2000-10-27: (rev 1.1) Change tagIsIn from returning a boolean to
// returning an integer equivalent (0=false, 1=true, reference in
// context is contained somewhere in the ReferenceString, 2=true,
// reference in context is perfect match for tag of reference string).
// This change was made to help prevent the picking up of bogus contexts.
// 2000-10-27: trim the tag before looking up context for it (handles
//      tags like "[ 2]").
// 2000-10-30: compare references in contexts and tags in reference
//      strings on a lower-case basis
// 2000-11-03: Add code to Reference Section to handle generated ordinals,
//      and move it out of RefLinkAnalyzer.
// 2000-12-04: arrange for literal Anchor to be stored in buildRefList
//      so that it is available when it comes time to make a Context object
// 2000-12-12: Context object gets a fourth field: number of reference
//      and a 5th field, the ordinal of the context string in the doc.
// 2000-12-14: Corrected a bracket problem with new Context objects.
// 2001-03-27: Fixed a bug in buildRefList
// 2001-04-17: Handle SurrogateException that's thrown by Reference constructor
// 2001-06-13: Add "see" and other words to the useless word list in tagIsIn
//             Fix bug in value returned by "notAllDigits"
// 2001-06-23: Add in citation part in buildRefList() function (labeled 5)
//            BY: Yu Chen
// 2001-07-24: Handle reference tags with format <digits><blanks><Capitalized
//       string>, e.g. "13 Koster,Martijn. "
// 2001-06-08:  Fix null pointer bug in massageContexts (it was triggered
//              if in DEBUG mode)


import Linkable.API.Reference;
import Linkable.API.Creation;
import Linkable.API.SurrogateException;
import Linkable.API.Surrogate;
import Linkable.Utility.CiteRefDatabase;
import Linkable.Utility.CreationDatabase;
import Linkable.Utility.Context;
import Linkable.Utility.CONFIG;

import java.util.Vector;
import java.util.StringTokenizer;
import java.util.NoSuchElementException;
import java.util.*;

public class ReferenceSection {

   // Prefix placed on the diagnostic lines this class writes to System.err
   private static final String ME = "ReferenceSection: ";
   // Copied from CONFIG.DEBUG; when true the methods below trace their work
   private static final boolean DEBUG = CONFIG.DEBUG;

   // NOTE(review): "hint" is never read or written by the code visible in
   // this file -- confirm whether it is still needed.
   private int hint = -1;                // kind of tags this doc has
   // Set by massageContexts from SentenceTree.getHint() of the last context
   // sentence, or -1 when there were no contexts; consulted by tagIsIn.
   private int contextHint = -1;         // kind of references in contexts

   // Some hints on what format the reference strings are in
   // NOTE(review): NULL and NUMERALS are not referenced by the code visible
   // in this file.
   private static final int NULL = 0;    // no tags
   private static final int NUMERALS=1;  // digits + '.'
   // the remaining constants are just contextHint+2;

   // One String per completed reference; endText appends, buildRefList reads
   private Vector w = new Vector();    // temporary for reference strings
   // inverted[i] is SentenceTree.getLinks() of the i-th context sentence
   private String[] inverted=null;     // tag-strings for each context
   // anchors[i] is SentenceTree.getAnchors() of the i-th context sentence
   private String[][] anchors = null;
   // Rebuilt by buildRefList for each reference before it is handed to the
   // Reference constructor
   private Context[] contexts=null;    // contexts for one reference

   // Accumulated by addText, flushed into w and reset by endText
   private String reference = "";      // holds one reference
   private int n=0;                    // number of chunks in current ref
   // Incremented by endText each time an ordinal is generated
   private int ord=0;                  // for generated ordinals
   // Switched on by generateOrdinals(); never switched off in this file
   private boolean generatingOrdinals=false;

   // Written by tagIsIn as a side effect; only meaningful when tagIsIn
   // has just returned 1
   private int anchorIndex=-1;         // which anchor wins in this context

   //========== Protected Routines Used by RefLinkAnalyzer =============

   /**
    * Creates an empty ReferenceSection: no reference text has been
    * accrued and no references have been stored yet.  When DEBUG is on,
    * the construction is announced on System.err.
    */
   protected ReferenceSection ( ) {
      if ( !DEBUG ) return;
      System.err.println(ME+"constructing a new ReferenceSection.");
   }

   /** generateOrdinals
    * Switches on the generation of ordinals.  The parser calls this when
    * it finds that the reference tags of the document are produced by
    * list macros; from then on endText() prefixes every stored reference
    * with "<ordinal>. ".
    */
   protected void generateOrdinals() {
      this.generatingOrdinals = true;
   }

   /**
    * addText appends a hunk of text to the reference being accrued and
    * counts it as one more chunk of the current reference.
    * @param textString the text to be added (an unmarked-up String);
    *        a null hunk is ignored and is not counted as a chunk
    */
   protected void addText ( String textString ) {
      // "reference += null" would splice the four letters "null" into the
      // reference string, so a missing hunk is simply skipped.
      if ( textString == null ) return;
      reference += textString;
      n++;
      if (DEBUG) System.err.println(ME+"reference="+reference);
   }

   /**
    * endText() closes off the reference that has been accrued through
    * (perhaps repeated) calls to addText().  Newlines are turned into
    * blanks and the result is trimmed; if anything is left it is stored
    * as one reference string, prefixed with a generated ordinal when
    * ordinal generation is on.  The accumulator and the chunk count are
    * reset in every case.
    */
   protected void endText() {
      final String complete = reference.replace('\n',' ').trim();
      if (DEBUG) {
         System.err.println (ME+"full reference(read in " + n +
            " chunks) is <"+complete+">");
      }
      if ( complete.length() > 0 ) {
         // the ordinal counter only advances for references actually kept
         w.addElement( generatingOrdinals
                       ? Integer.toString(++ord)+". "+complete
                       : complete );
      }
      reference = "";
      n = 0;
   }

   // buildRefList -
   /**
    * Extracts the list of References held in this ReferenceSection object.
    * For every stored reference string this method
    * (1) works out the tag, (2) collects the best-matching contexts,
    * (3) hands the string to Decite.parse, (4) records "source cites
    * target" in the cite-ref database, builds the Reference, and
    * (5) adds the citation to the Surrogate of the referenced work.
    *
    * @param sourceURN URN of the item for which this is the Reference Section
    * @param contextTrees a Vector of SentenceTrees that contained reference
    * normalizations within the text of this document.
    * If it is null, none of the Reference objects will have any context,
    * which means they weren't references after all, probably.
    * @return An array of Reference objects, or null if the internal vector
    * of reference strings is null.  An element is null if its reference
    * string could not be parsed (Decite.parse returned null, so the
    * Reference constructor threw SurrogateException).  Thus refList[]
    * elements should be checked for nullness before use.
    *
    * Side effects: Each parsed reference is added to the cite-ref database
    *               for this sourceURN, and, when the referenced work has a
    *               DOI, to that work's Surrogate.
    *               The "contexts" field is overwritten for each reference.
    */
   protected Reference[] buildRefList (String sourceURN, Vector contextTrees) {

      if (w == null) {
         System.err.println(ME+" in buildRefList finds null w");
         return null;
      }

      // Only digest the contexts when there are any; a null vector is a
      // legal argument and simply means "no contexts".
      if ( contextTrees != null ) massageContexts( contextTrees );

      // refList will be filled up from the w vector and returned to caller
      Reference[] refList = new Reference[w.size()];

      if ( DEBUG ) System.err.println(ME + " starting on references");
      for (int i = 0; i < w.size(); i++) {

         String refString = (String)w.elementAt(i);

         // (1) Figure out what the tag is: the first token that is not an
         //     opening bracket or a blank.
         StringTokenizer st = new StringTokenizer(refString,"[](){}. ",true);
         String tag = "";
         try {
            tag = st.nextToken();
            while ( tag.equals("[") || tag.equals("(")
                 || tag.equals("{") || tag.equals(" ") ) tag = st.nextToken();
         } catch (NoSuchElementException ignored) {
            // Nothing but opening brackets/blanks: the last token read
            // stays in "tag".
         }
         tag = tag.trim();
         if ( DEBUG )
            System.err.println(ME+"handling reference tag [" + tag + "]");

         // (2) Collect best contexts for this tag.  Without context trees
         //     the reference gets no contexts at all (rather than whatever
         //     an earlier reference or call left behind).
         contexts = ( contextTrees == null )
                    ? null
                    : collectContexts( refString, tag, i, contextTrees );

         // The tag is null if it is followed by "." and not all digits
         boolean periodFollows =
            st.hasMoreTokens() && st.nextToken().equals(".");
         if ( periodFollows && tag.length() > 0 && notAllDigits (tag) ) {
            tag = null;
         }
         else if ( refString.length() > tag.length() ) {
            // strip the tag and the one character after it off refString;
            // clamp so that a string ending right after the tag (e.g. "[1")
            // yields "" instead of an index error
            int cut = refString.indexOf(tag) + tag.length() + 1;
            if ( cut > refString.length() ) cut = refString.length();
            refString = refString.substring(cut).trim();
         }

         // (3) convert this reference string to a Creation r
         //     If r == null, that means the Reference String failed to parse
         Creation r = Decite.parse ( refString );

         // (4) add "b cites r" to the cite-ref database
         if ( r != null ) {
            if (r.iExtended > 0) {
               // drop the last iExtended characters of the URN
               StringBuffer tempUrn = new StringBuffer( r.getURN());
               tempUrn.delete(tempUrn.length() - r.iExtended,
                              tempUrn.length());
               r.setUrn(tempUrn.toString().trim());
            }
            String targetURN = r.getURN().trim();
            String[] source = {sourceURN.trim(), Integer.toString(i+1)};
            CiteRefDatabase.addCiteRef(source,targetURN);
         }

         // new Reference throws SurrogateException if passed a null r;
         // refList[i] then stays null, as documented above
         try { refList[i] = new Reference(r,i+1,tag,refString,contexts); }
         catch (SurrogateException sw) {
            if ( DEBUG ) System.err.println(ME+"no Reference built for entry "
               + (i+1) + ": " + sw);
         }

         // (5) for each reference, this item is its citor
         //     so need to update each reference's citationList.
         //     Nothing can be updated for a reference that failed to parse.
         if ( r != null && refList[i] != null ) {
            addToCitationList( r, refList[i] );
         }

      } // done with reference "i"

      return refList;
   }  // buildRefList

   /* collectContexts -
    * Scores every context against one reference (via tagIsIn) and turns
    * the contexts with the highest non-zero score into Context objects.
    * refIndex is the index of the reference in w.
    */
   private Context[] collectContexts ( String refString, String tag,
                                       int refIndex, Vector contextTrees ) {
      String lowerRef = refString.toLowerCase();
      String bracketedTag = "[" + tag + "]";
      int[] saved = new int[inverted.length];   // indices of kept contexts
      Vector normalizations = new Vector();     // normalization that was used
      Vector literalAnchors = new Vector();     // anchors (literally)
      int count = 0;                            // number of kept contexts
      int best = 0;                             // best score seen so far

      for (int j=0; j < inverted.length; j++ ) {
         // Note: following call also sets anchorIndex
         int value = tagIsIn( lowerRef, bracketedTag, inverted[j] );
         // skip non-matches and matches poorer than what we already hold
         if ( value <= 0 || value < best ) continue;
         // a better match makes everything collected so far obsolete
         if ( value > best ) {
            normalizations.clear();
            literalAnchors.clear();
            count = 0;
            best = value;
         }
         saved[count++] = j;
         if ( value == 2 ) {
            normalizations.addElement ( bracketedTag );
            literalAnchors.addElement ( tag );
         }
         else {   // value == 1: matched through the anchorIndex-th anchor
            normalizations.addElement(findAnchor(inverted[j],anchorIndex));
            literalAnchors.addElement(anchors[j][anchorIndex]);
         }
      }
      // It is an error if the following accounts are not equal
      if ( DEBUG ) System.err.println(ME
         +"saved has " + count + " elements, "
         + "literalAnchors has " + literalAnchors.size() + " elements, "
         +"and normalizations has " + normalizations.size() + " elements.");

      Context[] found = new Context[count];
      for (int j=0; j<count; j++) {
         int k = saved[j];
         found[j] = new Context (
            (String)literalAnchors.elementAt(j),
            (String)normalizations.elementAt(j),
            ((((SentenceTree)contextTrees.elementAt(k)).text())
               .replace('\n',' ')).trim(), refIndex, k+1);
         if ( DEBUG )
            System.err.println(ME+"new Context:\n"
               + "\""+(String)literalAnchors.elementAt(j)+"\" "
               + (String)normalizations.elementAt(j)
               +((SentenceTree)contextTrees.elementAt(k)).text()
               + " refIndex="+refIndex+", ordinal=" + (k+1) );
      }
      return found;
   }

   /* addToCitationList -
    * Adds the citation built from "ref" to the Surrogate of the work "r",
    * provided the Creation database knows a DOI (longer than 2 characters)
    * for that work.
    */
   private void addToCitationList ( Creation r, Reference ref ) {
      String docID = CreationDatabase.getDOI(r.getURN());

      if (DEBUG) {
         System.err.println(ME+"In Building RefList URN: " + r.getURN());
         System.err.println(ME+"In Building RefList doi: " + docID);
      }
      if ( docID != null && docID.length()>2) {
         String sDir = CONFIG.REPOSITORY + "/" + docID + "/Surrogate";
         Surrogate sRef = new Surrogate(sDir, r.getURN(), docID);
         sRef.addCitation(Surrogate.buildCitation(ref));
         sRef.save();
      }
      else if (DEBUG) {
         System.err.println("The WORK-" + r.getURN() + " does not exist!");
      }
   }

   /**
    * refCount reports how many reference strings have been stored so far.
    * @return the number of stored references, 0 if the store is null
    */
   protected int refCount() {
      if ( w == null ) return 0;
      return w.size();
   }

   /**
    * isRefSectionHeader checks for reference section header.
    * A header qualifies if it is exactly one of a fixed set of titles
    * (compared case-sensitively) or if isRefSection accepts it.
    * @param textString a String of text, trimmed, that should be a header;
    *        null is accepted and is never a reference section header
    * @return true if the header could be the start of a reference section
    */
   protected static boolean isRefSectionHeader ( String textString ) {
      // isRefSection already tolerates null; do the same here instead of
      // failing on the first equals() call
      if ( textString == null ) return false;
      return textString.equals("References")
         ||  textString.equals("REFERENCES")
         ||  textString.equals("Notes and References")
         ||  textString.equals("Note and References")
         ||  textString.equals("Bibliography")
         ||  textString.equals("Bibliography and Notes")
         ||  isRefSection (textString) ;
   }

   //=================   PRIVATE PROCEDURES =======================

   /* findAnchor -
    * listOfAnchors looks like "References in context: [...][...]...".
    * Returns the j-th bracketed string (counting from 0), brackets
    * included.  Each earlier anchor is skipped by cutting the string off
    * just behind its closing bracket.
    */
   private String findAnchor ( String listOfAnchors, int j ) {
      if ( DEBUG ) {
         System.err.println(ME +"in findAnchor to find " + j + "-th anchor in "
            + listOfAnchors);
      }
      String rest = listOfAnchors;
      int open = rest.indexOf('[');
      int close = rest.indexOf(']');
      for ( int skipped = 0; skipped < j; skipped++ ) {
         rest = rest.substring(close+1);
         open = rest.indexOf('[');
         close = rest.indexOf(']');
      }
      return rest.substring(open, close+1);
   }

   // tagIsIn -
   // Scores one context against one reference.
   // contextLinks is a string like "References in this context: [1][7][8]"
   // or "References in this context: Bearman and Trant, 1998";
   // refString is the reference string, already lower-cased by the caller.
   // Returns 2 if "tag" occurs literally in contextLinks.
   // Returns 1 as soon as a bracketed anchor of contextLinks is closed
   // without any of its significant words being absent from refString.
   // (Insignificant words are the ones in the "noise" table below, plus
   // blank and comma.)  In that case the index of the winning anchor is
   // left in the field anchorIndex; anchorIndex is relevant only if
   // returning 1.
   // Returns 0 if this context is not a match for this tag.
   // A shortcoming of this algorithm is that a word counts as present if
   // it is a substring of refString, not necessarily a standalone word.

   private int tagIsIn
   ( String refString, String tag, String contextLinks ) {
      if ( tag != null ) {
         // NOTE(review): "> 0" ignores a tag sitting at the very start of
         // contextLinks; harmless as long as contextLinks always carries
         // a leading "References in this context:" prefix -- confirm.
         if ( contextLinks.indexOf(tag) > 0 ) return 2;
         // Don't look any further for numeric tags
         if ( contextHint == SentenceTree.SQUARE_BRACKETS_AROUND_NUMERALS )
            return 0;
      }

      // words that say nothing about which work is being cited
      final String[] noise = {
         "&", "see", "also", "for", "and", "others", "et", "al", "al.",
         " ", "example", "from", "adapted", ","
      };

      StringTokenizer words = new StringTokenizer(
         contextLinks.toLowerCase()," [],", true );
      // 'a' = outside an anchor, waiting for "["
      // 'b' = inside an anchor, every significant word found so far
      // 'c' = inside an anchor that has already failed, waiting for "]"
      char state = 'a';
      anchorIndex = -1;
      while ( words.hasMoreTokens() ) {
         String word = words.nextToken();
         if ( state == 'a' ) {
            if ( word.equals("[") ) state = 'b';
         }
         else if ( state == 'b' ) {
            if ( word.equals("]") ) {
               anchorIndex++;
               return 1;
            }
            boolean significant = true;
            for ( int i = 0; significant && i < noise.length; i++ ) {
               if ( word.equals(noise[i]) ) significant = false;
            }
            if ( significant && refString.indexOf(word) == -1 ) state = 'c';
         }
         else if ( state == 'c' ) {
            if ( word.equals("]") ) {
               state = 'a';
               anchorIndex++;
            }
         }
         else {
            if (DEBUG)
               System.err.println(ME+"tagIsIn got funny token in state "
                  + state + ": " + word);
            return 0;
         }
      }
      return 0;
   }  // tagIsIn


   // massageContexts -
   /** extracts information from the sentences saved in "contextTrees".
   * @param contextTrees The context sentences (SentenceTree objects) found
   * while scanning the text; may be null, meaning there are no contexts.
   * Note: these global variables become defined as a result of massageContexts:
   *    anchors - the array is allocated, one row per context
   *              Then each row is filled in with String[] sentence.getAnchors
   *    inverted[i] set to sentence.getLinks(), i.e.
   *               "References in this context: [ref1][ref2]..."
   *               for the i-th context
   *    contextHint - getHint() of the last context sentence, or -1 if
   *               there were no contexts
   * For a null contextTrees, anchors and inverted are set to null and
   * contextHint to -1.
   */
   private void massageContexts( Vector contextTrees ) {
      // No contexts at all: leave no stale tables behind and do not touch
      // the vector (calling size() on it would throw).
      if ( contextTrees == null ) {
         anchors = null;
         inverted = null;
         contextHint = -1;
         return;
      }

      SentenceTree sentence = null;   // working variable for one tree

      if ( DEBUG ) {
         System.err.println(ME+" has collected " + w.size() + " references");
         System.err.println(ME+
            contextTrees.size() + " contexts are as follows:" );
         for ( int i=0; i<contextTrees.size(); i++ ) {
            sentence = (SentenceTree)contextTrees.elementAt(i);
            System.err.println("  ("+i+") "+ sentence.dump() );
            System.err.println("\n" + sentence.getTags() );
         }
         System.err.println("\n");
      }

      // Save the reference tags found in the contexts for later searching
      // Also save the type of context reference for later use.
      if (DEBUG) System.err.println(ME+"References in contexts:\n");
      anchors = new String[contextTrees.size()][];
      if (DEBUG) System.err.println(ME+"has constructed an array of "
         + anchors.length + " anchor arrays");
      inverted = new String[contextTrees.size()];
      for (int i=0; i<contextTrees.size(); i++) {
         sentence = (SentenceTree)contextTrees.elementAt(i);
         inverted[i] = sentence.getLinks();
         anchors[i] = sentence.getAnchors();
         if ( DEBUG )  { // print out references in each of the contexts
            System.err.print(ME+"("+i+") " + inverted[i] +
               (anchors[i]==null?"0":Integer.toString(anchors[i].length))
               + " anchors");
            if ( anchors[i] != null )
               for (int j=0; j<anchors[i].length; j++)
                  System.err.print("  \"" + anchors[i][j] + "\"");
            System.err.println("\n");
         }
      }
      // "sentence" now holds the last context sentence, if there was one
      if ( sentence != null) {
         contextHint = sentence.getHint();
         if ( DEBUG ) System.err.println(ME+" contextHint set to "
            + contextHint );
      } else contextHint = -1;            // must have been zero contexts
      if (DEBUG)System.err.println(ME+
         "inverted reference lists have been computed");
   } // massageContexts

   //============  Static Procedures ============================

   //      notAllDigits
   /** Determine whether a tag is something other than a number.
    *  The test is whether the tag contains at least one letter; digits
    *  and punctuation alone do not make a tag "not all digits".
    *  @param tag the tag (a String)
    *  @return true if the tag contains at least one letter, false if it
    *  contains none (which includes the empty and the null tag)
    */
   private static boolean notAllDigits ( String tag ) {
      if ( tag == null ) return false;
      for ( int i = 0; i < tag.length(); i++ ) {
         if ( Character.isLetter(tag.charAt(i)) ) return true;
      }
      return false;
   }

   // We come to a reference section if this is a header of the
   // form <section#> ["."] ( "References" | "Bibliography" )
   // where <section#> is one or more digits and nothing but white space
   // may surround the keyword.  Null and empty text are never a match.
   private static boolean isRefSection ( String text ) {
      if ( DEBUG )
         System.err.println(ME+" in isRefSection with <"+text+">");
      if (text == null || text.equals("")) return false;
      if ( !Character.isDigit(text.charAt(0)) ) return false;

      // Skip the rest of the section number.  The index is advanced only
      // past digits, so the character that ends the number is still there
      // to be examined (it must not be silently swallowed, or a header
      // such as "5xReferences" would be accepted).
      int i = 1;
      while ( i < text.length() && Character.isDigit(text.charAt(i)) ) i++;
      if ( i == text.length() ) return false;     // nothing but digits
      if (text.charAt(i) == '.') i++;             // optional period

      StringTokenizer st = new StringTokenizer ( text.substring(i) );
      if (st.hasMoreTokens()) {
         String token = st.nextToken();
         if ( token.equals("References") || token.equals("Bibliography") ) {
            // the keyword must be the last word of the header
            boolean keywordIsLast = !st.hasMoreTokens();
            if ( DEBUG )
               System.err.println(ME+"isRefSection returns " + keywordIsLast );
            return keywordIsLast;
         }
      }
      return false;
   }

} // end ReferenceSection class

