package uk.ac.soton.harvester;
// DECITER.JAVA 
// VERSION 1.0.1 Tue Mar 28 07:33:51 UTC 2000
// VERSION 0.9.7 Friday January 28 10:47:56 GMT 2000
// VERSION 0.9.3 Friday January  7 14:40:50 GMT 2000
// AUTHOR: Les Carr, Multimedia Research Group, University of Southampton
// COPYRIGHT: University of Southampton
// LICENSED TO: INGENTA

import java.io.*;

/**
 * deciter class does all the significant work in decoding a set of citations
 *
 **/
public class Deciter {

	// ------------------------------------------------------------------
	// Indices into the AttributeMarker array
	// ------------------------------------------------------------------

	/**
	 *
	 * PREPROCESS is the index of the object in the AttributeMarkers array that performs
	 * any initial preprocessing before the recognition proper gets underway.
	 *
	**/
	static final int PREPROCESS=0;

	/**
	 *
	 * NUMBERING is the index of the object in the AttributeMarkers array that handles
	 * citation numbering (a DoNumbering object by default).
	 *
	**/
	static final int NUMBERING=1;

	/**
	 *
	 * DATE is the index of the object in the AttributeMarkers array that recognises
	 * the position of the date in the citation string.
	 *
	**/
	static final int DATE=2;

	/**
	 *
	 * AUTHORS is the index of the object in the AttributeMarkers array that recognises
	 * the position of the authors in the citation string.
	 *
	**/
	static final int AUTHORS=3;

	/**
	 *
	 * TITLE is the index of the object in the AttributeMarkers array that recognises
	 * the position of the title in the citation string.
	 *
	**/
	static final int TITLE=4;

	/**
	 *
	 * PAGERANGE is the index of the object in the AttributeMarkers array that recognises
	 * the position of the pagerange in the citation string.
	 *
	**/
	static final int PAGERANGE=5;

	/**
	 *
	 * PUBLICATION is the index of the object in the AttributeMarkers array that recognises
	 * the position of the journal title in the citation string.
	 *
	**/
	static final int PUBLICATION=6;

	/**
	 *
	 * VOLUMEISSUE is the index of the object in the AttributeMarkers array that recognises
	 * the position of the volume and issue in the citation string.
	 *
	**/
	static final int VOLUMEISSUE=7;

	/**
	 *
	 * PUBLISH is the index of the object in the AttributeMarkers array that recognises
	 * the position of the publisher in the citation string.
	 *
	**/
	static final int PUBLISH=8;

	/**
	 *
	 * PLACE is the index of the object in the AttributeMarkers array that recognises
	 * the position of the place of publication in the citation string.
	 *
	**/
	static final int PLACE=9;

	/**
	 *
	 * EXTRA is the index of the object in the AttributeMarkers array that recognises
	 * the position of any extra features (e.g. xxxid) in the citation string.
	 *
	**/
	static final int EXTRA=10;

	/**
	 *
	 * POSTPROCESS is the index of the object in the AttributeMarkers array that performs
	 * any subsequent postprocessing and rationalisation of the marker values.
	 *
	**/
	static final int POSTPROCESS=11;

	/**
	 *
	 * URL is the index of the object in the AttributeMarkers array that
	 * recognizes an http address if present in the citation
	 *
	**/
	static final int URL=12;                                     // djb

	/**
	 *
	 * N_AMS is the number of AttributeMarkers that are used. Every valid
	 * index into the AttributeMarkers array is in the range 0..N_AMS-1.
	 *
	**/
	static final int N_AMS=URL+1;                                // djb

	/**
	 *
	 * MDASH is the em-dash character that split_multiCitation looks for
	 * between citations.
	 *
	**/
	private static final char MDASH='\u2014';

	/**
	 *
	 * MIN_LEFTOVER_LENGTH is the length that left-over citation text must
	 * exceed before it is considered as a possible further citation.
	 *
	**/
	private static final int MIN_LEFTOVER_LENGTH=12;


	// ------------------------------------------------------------------
	// State
	// ------------------------------------------------------------------

	/**
	 *
	 * ds maintains the essential state that guides and results from
	 * the deciting process, mainly hints and markers.
	 *
	 **/
	private DeciterState ds = null;

	/**
	 *
	 * am holds the AttributeMarker objects which are the implementations
	 * of the recognisers for each particular attribute of a citation.
	 *
	**/
	private AttributeMarker am[]=new AttributeMarker[N_AMS];

	/**
	 *
	 * cit_out is the object which will print out the current citation. Printing is based on the
	 * citation data stored in the DeciterState.
	 *
	**/
	private CitationOutput cit_out;


	// ------------------------------------------------------------------
	// Construction and configuration
	// ------------------------------------------------------------------

	/**
	 *
	 * Constructor creates the DeciterState from the article ID and the
	 * array of options passed on the command line.
	 * Then it sets up the default values for the AttributeMarker objects
	 * and the citation outputter, and finally applies any -s / -c options
	 * that override those defaults.
	 * djb: add keyword public so that Cornell code can construct this
	 *
	 * @param id the id of the article whose citations are being processed
	 * @param opts the command line options; may be null
	**/
	public Deciter(String id, String opts[]){
		ds=new DeciterState(id, opts);

		setDefaultAttributeMarkerObjects();
		setDefaultCitationOutput();

		processOptions(opts);
	}

	/**
	 *
	 * setDefaultAttributeMarkerObjects simply fills the
	 * AttributeMarkers array with a default set of AttributeMarker objects.
	 * The AUTHORS and EXTRA recognisers depend on the xxxHint flag held in
	 * the DeciterState.
	 *
	**/
	private void setDefaultAttributeMarkerObjects(){
		am[PREPROCESS]=new DoNothing();
		am[NUMBERING]=new DoNumbering();
		am[DATE]=new DoDate();
		if(ds.xxxHint){
			am[AUTHORS]=new DoAuthors2();
			am[EXTRA]=new DoXXXId();
		}
		else{
			am[AUTHORS]=new DoAuthors();
			am[EXTRA]=new DoNothing();
		}
		am[TITLE]=new DoTitle();
		am[PAGERANGE]=new DoPageRange();
		am[PUBLICATION]=new DoPublication();
		am[VOLUMEISSUE]=new DoVolAndIssue();
		am[PUBLISH]=new DoPublisher();
		am[PLACE]=new DoPlace();
		am[POSTPROCESS]=new DoTidyUpMisc();
		am[URL]=new DoLink();                              // djb
	}

	/**
	 *
	 * isValidAttribute reports whether an attribute code is a usable index
	 * into the AttributeMarkers array. The upper bound is N_AMS so that
	 * every slot, including URL, can be replaced.
	 *
	**/
	private static boolean isValidAttribute(int which){
		return which>=0 && which<N_AMS;
	}

	/**
	 *
	 * attributeIndex maps the textual name of an attribute onto its index
	 * in the AttributeMarkers array.
	 *
	 * @param name the attribute name, e.g. "DATE"; may be null
	 * @return the matching index, or -1 if the name is not recognised
	**/
	private static int attributeIndex(String name){
		if("PREPROCESS".equals(name))return PREPROCESS;
		if("NUMBERING".equals(name))return NUMBERING;
		if("DATE".equals(name))return DATE;
		if("AUTHORS".equals(name))return AUTHORS;
		if("TITLE".equals(name))return TITLE;
		if("PAGERANGE".equals(name))return PAGERANGE;
		if("PUBLICATION".equals(name))return PUBLICATION;
		if("VOLUMEISSUE".equals(name))return VOLUMEISSUE;
		if("PUBLISH".equals(name))return PUBLISH;
		if("PLACE".equals(name))return PLACE;
		if("EXTRA".equals(name))return EXTRA;
		if("POSTPROCESS".equals(name))return POSTPROCESS;
		if("URL".equals(name))return URL;                  // djb
		return -1;
	}

	/**
	 *
	 * instantiate loads the named class and creates an object from its
	 * no-argument constructor. Any failure is reported on System.err and
	 * signalled to the caller by a null result, so that a bad class name
	 * given as data never aborts the run.
	 *
	 * @param className the name of the class to load
	 * @return a new instance of the class, or null if it could not be
	 *	loaded or constructed
	**/
	private static Object instantiate(String className){
		Class loaded=null;
		try{ loaded=Class.forName(className); }
		catch(ClassNotFoundException e){
			System.err.println("Can't find class "+className); return null;
		}
		// ExceptionInInitializerError is a LinkageError, so it must be caught first
		catch(ExceptionInInitializerError e){
			System.err.println("Initializer error with class "+className); return null;
		}
		catch(LinkageError e){
			System.err.println("Linkage error with class "+className); return null;
		}

		try{ return loaded.newInstance(); }
		catch(Exception e){
			System.err.println("Can't create an instance of class "+className+" ("+e+")");
			e.printStackTrace();
			return null;
		}
	}

	/**
	 *
	 * setAttributeMarker allows the recogniser for a particular attribute
	 * to be changed. The anticipated use is setAttributeMarker(DATE, new MyDateRecogniserClass());
	 * It ignores the request if the attribute code is not valid.
	 *
	 * @param which one of the values PREPROCESS, NUMBERING, DATE, AUTHORS, TITLE,
	 *	PAGERANGE, PUBLICATION, VOLUMEISSUE, PUBLISH, PLACE, EXTRA, POSTPROCESS
	 *	or URL
	 * @param a an object which implements the AttributeMarker interface;
	 *	null installs a DoNothing object
	**/
	public void setAttributeMarker(int which, AttributeMarker a){
		if(!isValidAttribute(which))return;
		am[which] = (a==null) ? new DoNothing() : a;
	}

	/**
	 *
	 * setAttributeMarker allows the recogniser for a particular attribute to be changed.
	 * The anticipated use is setAttributeMarker(DATE, "MyDateRecogniserClass");
	 * This version of the method is provided so that the class name can be given as data,
	 * for example as a command line argument or in a configuration file.
	 *
	 * It ignores the request if the attribute code is not valid. If the name given
	 * doesn't correspond to a findable class, if the class is badly constructed
	 * or if it is not actually an AttributeMarker then an error message is printed
	 * and the request is ignored.
	 *
	 * @param which one of the values PREPROCESS, NUMBERING, DATE, AUTHORS, TITLE,
	 *	PAGERANGE, PUBLICATION, VOLUMEISSUE, PUBLISH, PLACE, EXTRA, POSTPROCESS
	 *	or URL
	 * @param amName a String which gives the name of a class which implements the
	 * 	AttributeMarker interface. A new instance of this class will be created.
	**/
	public void setAttributeMarker(int which, String amName){
		// check the code first so that an invalid request never loads a class
		if(!isValidAttribute(which))return;

		Object amObject=instantiate(amName);
		if(amObject==null)return;	// already reported

		if(amObject instanceof AttributeMarker)
			setAttributeMarker(which, (AttributeMarker)amObject);
		else
			System.err.println("Class "+amName+" isn't an AttributeMarker");
	}

	/**
	 *
	 * a version of setAttributeMarker which is useful for argv: the
	 * attribute is identified by its name rather than by its code.
	 * An unrecognised name is reported on System.err and otherwise ignored.
	 *
	 * @param which the name of the attribute, e.g. "DATE" or "URL"
	 * @param amName the name of a class which implements the
	 *	AttributeMarker interface
	**/
	public void setAttributeMarker(String which, String amName){
		int index=attributeIndex(which);
		if(index<0)System.err.println("Unknown attribute type: "+which);
		else setAttributeMarker(index, amName);
	}

	/**
	 *
	 * setDefaultCitationOutput sets up the default kind of citation output object.
	 * This is the vanilla XML object. Alternatives are the HTML and plain text outputters.
	 *
	**/
	private void setDefaultCitationOutput(){
		cit_out=new XMLOutput(ds);
	}

	/**
	 *
	 * setCitationOutput specifies the citation output object.
	 * The standard choice is from a XML , HTML and plain text outputter objects.
	 *
	 * NOTE(review): unlike the String version, this method does not pass the
	 * DeciterState to the outputter - confirm that callers arrange this themselves.
	 *
	 * @param co an object from the CitationOutput-derived class which will be used
	 * 	for printing the citation data.
	**/
	public void setCitationOutput(CitationOutput co){
		cit_out=co;
	}

	/**
	 *
	 * setCitationOutput specifies the citation output object.
	 * The standard choice is from a XML , HTML and plain text outputter objects.
	 * This version of the method is provided so that the class name can be given as data,
	 * for example as a command line argument or in a configuration file.
	 *
	 * If the name given
	 * doesn't correspond to a findable class, if the class is badly constructed
	 * or if it is not actually a CitationOutput then an error message is printed
	 * and the request is ignored. On success the new object is given this
	 * Deciter's DeciterState before it is installed.
	 *
	 * @param coName the name of a CitationOutput-derived class which will be used
	 * 	for printing the citation data.
	**/
	public void setCitationOutput(String coName){
		Object coObject=instantiate(coName);
		if(coObject==null)return;	// already reported

		if(coObject instanceof CitationOutput){
			CitationOutput co=(CitationOutput)coObject;
			co.setDeciterState(ds);
			setCitationOutput(co);
		}
		else
			System.err.println("Class "+coName+" isn't a CitationOutput");
	}

	/**
	 *
	 * processOptions applies the options that configure this object:
	 * "-sATTR=ClassName" replaces the recogniser for attribute ATTR and
	 * "-cClassName" replaces the citation outputter. A "-s" option without
	 * an '=' sign, a null entry and any other option are skipped here.
	 *
	 * @param opts the command line options; may be null
	**/
	private void processOptions(String opts[]){
		if(opts==null)return;

		for(int c=0; c<opts.length; c++){
			String opt=opts[c];
			if(opt==null)continue;

			if(opt.startsWith("-s")){
				String setting=opt.substring(2);
				int eqIndex=setting.indexOf('=');
				if(eqIndex<0)continue;
				setAttributeMarker(setting.substring(0,eqIndex), setting.substring(eqIndex+1));
			}
			else if(opt.startsWith("-c")){
				setCitationOutput(opt.substring(2));
			}
		}
	}


	// ------------------------------------------------------------------
	// Deciting
	// ------------------------------------------------------------------

	/**
	 *
	 * dodecite handles the whole deciting process for a single citation entry.
	 * If the multiCiteMDash hint is set and a citation separator is spotted then
	 * the entry is split at each separator: dodecite is called recursively
	 * on the first fragment, and every later fragment is prefixed with the
	 * first author recorded in the DeciterState and handed to dodecite_simple.
	 * If no separator is seen, then dodecite_simple is invoked on the whole entry.
	 *
	 * @param line the string containing the citation under scrutiny
	 * @param pr the page number of the article which contained this citation
	 * @param wr the word number at which this citation started on the page
	 * @param Output the PrintWriter to which all output must be sent
	**/
	public void dodecite(String line, String pr, String wr, PrintWriter Output){
		int cs=line.indexOf(ds.MDashCiteSep);

		if(!ds.multiCiteMDashHint || cs<0){
			dodecite_simple(line, pr, wr, Output);
			return;
		}

		int sepLength=ds.MDashCiteSep.length();

		String nextCite=line.substring(0,cs);
		line=line.substring(cs+sepLength);
		Utils.DEBUG("NEXTCITE=>"+nextCite);
		dodecite(nextCite, pr, wr, Output);

		// the remaining fragments share the author of the first one
		String aut=ds.firstAuthor;

		while(line.length()!=0){
			cs=line.indexOf(ds.MDashCiteSep);
			if(cs>=0){
				nextCite=line.substring(0,cs);
				line=line.substring(cs+sepLength);
			}
			else{
				nextCite=line;
				line="";
			}
			Utils.DEBUG("NEXTCITE=>"+nextCite);
			dodecite_simple(aut+" "+nextCite, pr, wr, Output);
		}
	}

	/**
	 *
	 * dodecite_simple handles the whole deciting process for a single citation
	 * (sub)entry.
	 * The AttributeMarker objects are applied in the following order:
	 * preprocess, date, numbering, authors, title, page range, publication,
	 * volume and issue, extra, place, publisher, url, postprocess.
	 * The recognised data is then written by the citation outputter.
	 * An exception raised by a marker is reported and the citation is
	 * still output with whatever was recognised up to that point.
	 *
	 * If more than MIN_LEFTOVER_LENGTH characters are returned by the
	 * outputter as left over, they are passed to split_multiCitation, which
	 * may call dodecite recursively.
	 *
	 * @param line the string containing the citation under scrutiny
	 * @param pr the page number of the article which contained this citation
	 * @param wr the word number at which this citation started on the page
	 * @param Output the PrintWriter to which all output must be sent
	**/
	protected void dodecite_simple(String line, String pr, String wr, PrintWriter Output){
		int i=0;
		int maxi=line.length()-1;

		ds.setNewCitation(line);

		ds.nCites++;
		Utils.DEBUG("DECITING["+ds.nCites+"] ("+maxi+")]->"+line);

		try{
			i=am[PREPROCESS].markAttribute(ds,i);

			//even though the date probably comes after the authors, find
			//the date first, so it is easy to see where the authors end.
			//The scan starts at 0 and its result is deliberately not used to advance i.
			am[DATE].markAttribute(ds,0);

			i=am[NUMBERING].markAttribute(ds,i);
			i=am[AUTHORS].markAttribute(ds,i);
			i=am[TITLE].markAttribute(ds, i);
			i=am[PAGERANGE].markAttribute(ds, i);
			i=am[PUBLICATION].markAttribute(ds, i);
			i=am[VOLUMEISSUE].markAttribute(ds, i);
			i=am[EXTRA].markAttribute(ds, i);
			i=am[PLACE].markAttribute(ds, i);
			i=am[PUBLISH].markAttribute(ds, i);
			i=am[URL].markAttribute(ds, i);                     // djb
			i=am[POSTPROCESS].markAttribute(ds, i);
		}
		catch(Exception e){
			// best effort: report the problem and carry on to the output stage
			Utils.DEBUG("***ERROR IN CITATION PROCESSING ("+e+")***");
			e.printStackTrace();
		}

		String rest=cit_out.output(Output);

		//but wait! there may be a further bit of citation hanging on
		if(rest!=null && rest.length()>MIN_LEFTOVER_LENGTH)
			split_multiCitation(rest, pr, wr, Output);
	}

	/**
	 *
	 * split_multiCitation
	 *
	 * If significant citation material is found to be left over with a multiCite
	 * hint in operation, it may be assumed that another citation occurrence has
	 * been found and dodecite may be called recursively.
	 *
	 * Leading characters that are neither letters nor digits are discarded
	 * first. With the multiCiteMDash hint, a further citation is assumed if
	 * an em-dash was seen among the discarded characters or occurs later in
	 * the text; in the latter case everything up to and including that run of
	 * em-dashes is dropped. With the multiCiteSharesAuthor hint, a further
	 * citation is assumed if the text starts with a four digit year beginning
	 * 19 or 20 that is not followed by another digit.
	 *
	 * @param rest the remaining part of the line containing the citation
	 * under scrutiny
	 * @param pr the page number of the article which contained this citation
	 * @param wr the word number at which this citation started on the page
	 * @param Output the PrintWriter to which all output must be sent
	**/
	protected void split_multiCitation(String rest, String pr, String wr, PrintWriter Output){
		int len=rest.length();
		boolean seenMDash=false;

		// skip the leading punctuation, noting whether it includes an em-dash
		int skip=0;
		while(skip<len){
			char ch=rest.charAt(skip);
			if(Character.isLetterOrDigit(ch))break;
			if(ch==MDASH)seenMDash=true;
			skip++;
		}

		rest=rest.substring(skip);
		len=rest.length();
		if(len<=MIN_LEFTOVER_LENGTH)return;	// too short to be another citation

		if(ds.multiCiteMDashHint){
			if(!seenMDash){
				int ed=rest.indexOf(MDASH);
				if(ed>0){
					seenMDash=true;
					while(ed<len && rest.charAt(ed)==MDASH)ed++;
					rest=rest.substring(ed);
				}
			}

			if(seenMDash)
				dodecite(ds.firstAuthor+" "+rest, pr, wr, Output);
		}
		else if(ds.multiCiteSharesAuthorHint){
			// rest is longer than MIN_LEFTOVER_LENGTH here, so indices 0..4 exist
			char ch1=rest.charAt(0);
			char ch2=rest.charAt(1);

			String author="";
			if(ds.authb>=0)author=Utils.detag(ds.line.substring(ds.authb,ds.authe+1).trim());

			boolean centuryOk=(ch1=='1' && ch2=='9')||(ch1=='2' && ch2=='0');
			boolean startsWithYear=centuryOk
				&& Character.isDigit(rest.charAt(2))
				&& Character.isDigit(rest.charAt(3))
				&& !Character.isDigit(rest.charAt(4));

			if(startsWithYear)
				dodecite(author+" "+rest, pr, wr, Output);
		}
	}


	// ------------------------------------------------------------------
	// Input handling
	// ------------------------------------------------------------------

	/**
	 *
	 * readNumericAttribute extracts the run of digits that follows the
	 * given key (e.g. "page=") in a tag line. One opening quote character
	 * (single or double) after the key is skipped.
	 *
	 * @param line the tag line to search
	 * @param key the attribute name including its '=' sign
	 * @return the digits found, or the empty string if the key is absent,
	 *	starts the line, ends the line or is not followed by a digit
	**/
	private static String readNumericAttribute(String line, String key){
		int start=line.indexOf(key);
		if(start<=0)return "";

		start+=key.length();
		int end=line.length();
		if(start>=end)return "";	// key is the last thing on the line

		char first=line.charAt(start);
		if(first=='"' || first=='\'')start++;

		int stop=start;
		while(stop<end && Character.isDigit(line.charAt(stop)))stop++;
		return line.substring(start,stop);
	}

	/**
	 *
	 * doReadLoop performs a read loop, reading a line from the input,
	 * and processing and printing it to the output.
	 * It handles the simple (unspecified) XML input lines from the
	 * C/PDF handler phase, sticking continuation lines back together to
	 * handle citation split over page boundaries in a PDF file.
	 *
	 * Text lines are accumulated until the next bibitem opening tag or the
	 * end of the bibliography, at which point the accumulated citation is
	 * passed to dodecite. The page and wordref attributes of the bibitem
	 * opening tag are remembered and passed along with the citation.
	 * Continuation (contitem) tags and closing tags are skipped so that the
	 * text on either side of them is joined into one citation.
	 *
	 * @param inp the reader supplying the input lines
	 * @param Output the PrintWriter to which all output must be sent
	 * @throws IOException if reading from inp fails
	**/
	protected void doReadLoop(BufferedReader inp, PrintWriter Output) throws IOException {
		String line;
		String pr="";
		String wr="";
		String wholebib="";

		//allow the outputter to emit a header (especially for XML)
		cit_out.pre(Output);

		while((line=inp.readLine())!=null){
			if(line.startsWith("<bibliography>")){
				// ignore this!
			}
			else if(line.startsWith("</bibliography>")){
				if(wholebib.length()!=0){
					dodecite(wholebib,pr,wr,Output);
					wholebib="";
				}
			}
			else if(line.indexOf("<bibitem")>=0){
				//first deal with the last whole item
				if(wholebib.length()!=0){
					dodecite(wholebib,pr,wr,Output);
					wholebib="";
				}

				pr=readNumericAttribute(line, "page=");
				wr=readNumericAttribute(line, "wordref=");
			}
			else if(line.indexOf("<contitem")>=0){
				// a continuation marker: the citation text simply carries on,
				// so the tag (with or without attributes) is not part of it
			}
			else if(line.indexOf("contitem>")<0 && line.indexOf("bibitem>")<0)
				wholebib=wholebib+" "+line;
		}

		//allow the outputter to emit a footer (especially for XML)
		cit_out.post(Output);
	}

	/**
	 *
	 * doit runs the citation harvesting process: it calls the read loop to
	 * process all the citations and then flushes the output.
	 *
	 * @param inp the (de-entitied) input stream containing citations in a
	 *	  primitive XML format
	 * @param outp the (re-entitying) output stream to which the citation entries
	 *	  will be written.
	 * @return the number of citations processed
	 * @throws IOException if reading from inp fails
	 *
	**/
	public int doit(BufferedReader inp, PrintWriter outp) throws IOException{
		doReadLoop(inp, outp);
		outp.flush();
		return ds.nCites;
	}

}


/**************************************************************************
Citations Model: for a journal, a citation will look like either
    OPTIONAL_NUMBER AUTHORS DATE TITLE PUBLICATION VOLUME ISSUE PAGE-RANGE
or
    OPTIONAL_NUMBER AUTHORS TITLE PUBLICATION VOLUME ISSUE PAGE-RANGE DATE
if it's a book then
    OPTIONAL_NUMBER AUTHORS DATE PUBLICATION PUBLISHER LOCATION
or
    OPTIONAL_NUMBER AUTHORS PUBLICATION PUBLISHER LOCATION DATE 

An OPTIONAL_NUMBER can be easily recognised at the start.
A DATE can be easily recognised (? 1[89][0-9][0-9][a-z]? )?
A PAGE-RANGE can be easily recognised  [0-9]+ *<DASH> *[0-9]+
If there is NO dash then a page range is just the start page which is
   the last number before the end (or the date)
VOL & ISS are the two numbers preceding the page range (iss is optional)
How to differentiate title from pub and authors from title?
**************************************************************************/
