package uk.ac.soton.harvester;
import java.io.*;
import java.util.Locale;
import java.util.StringTokenizer;

/**
 * Utils is a place for miscellaneous utility methods to try to
 * control class bloat!
 *
 **/
public class Utils {
	

	/**
	 *
	 * debugging is a boolean state determining whether debugging messages
	 * are printed or not. There is no concept of debugging levels---it's
	 * all or nothing.
	 *
	 * It starts out <code>false</code> (no debugging output), is changed
	 * through setDebugging and is consulted by DEBUG on every call.
	 *
	**/
	private static boolean debugging=false;


	/**
	 * Switches the printing of DEBUG messages on or off.
	 *
	 * @param b <code>true</code> to have DEBUG print its messages,
	 *          <code>false</code> to silence them
	 */
	public static void setDebugging(boolean b) {
		debugging = b;
	}

	/**
	 *
	 * dbugWriter is a PrintStream to which all debugging output is sent.
	 * This is so that output can be caught successfully on Windows/DOS systems.
	 *
	 * It is left null here and is only created by DEBUG, the first time a
	 * message is actually printed.
	 *
	**/
	private static PrintStream dbugWriter;

	/**
	 * Convenience method for producing debugging output.
	 *
	 * Does nothing unless debugging has been switched on. The first time a
	 * message is actually printed, the debugging output file (called
	 * "deciter.err", relative to the current directory) is opened; if that
	 * fails with an IOException, output goes to System.err instead. The
	 * stream is then reused for every later call.
	 *
	 * @param s the String to be written to the debugging output (a newline is added).
	 */
	public static void DEBUG(String s) {
		if (!debugging) {
			return;
		}

		if (dbugWriter == null) {
			// Lazily pick the sink: the log file if it can be opened,
			// otherwise fall back to standard error.
			PrintStream sink;
			try {
				sink = new PrintStream(new FileOutputStream(new File("deciter.err")));
			} catch (IOException e) {
				sink = new PrintStream(System.err);
			}
			dbugWriter = sink;
		}

		dbugWriter.println(s);
	}

	/**
	 *
	 * ee is an entity encoder object which contains the mapping from
	 * (non-)ASCII to ISO-Latin1 entity names. Most of this task should
	 * be performed invisibly by the output PrintWriter, however, the
	 * deciter needs to have explicit control of the coding process because
	 * it needs to emit tags which should not be transformed (i.e.
	 * <tt>&lt;author&gt;</tt> should not appear as
	 * <tt>&amp;lt;author&amp;gt;</tt> ).
	 *
	 * The field is public and not final, so it can be replaced or set to
	 * null; PCDATA checks for null and then passes strings through unchanged.
	 *
	**/
	public static EntityEncoder ee=new EntityEncoder();

	/**
	 * Convenience accessor for the entity encoder.
	 *
	 * It is used to explicitly transform the output of any data string that
	 * may contain non-ASCII characters or less-than, greater-than or
	 * ampersand symbols. These must all be turned into XML entities by the
	 * EntityEncoder object.
	 *
	 * @param s the string to encode
	 * @return whatever the encoder's PCDATA method returns for <code>s</code>,
	 *         or <code>s</code> itself, untouched, when no encoder is
	 *         installed (<code>ee</code> is null)
	 */
	public static String PCDATA(String s) {
		return (ee == null) ? s : ee.PCDATA(s);
	}

	/**
	 * "Ignore case (of initial), starts-with, then end of word": a
	 * case-insensitive version of startsWith, used to make "Del " and
	 * "del " match. After the prefix, <code>s1</code> must either end or
	 * continue with a whitespace character.
	 *
	 * The comparison ignores case on the whole prefix, not just its first
	 * letter, and does not depend on the JVM's default locale (so "DI"
	 * matches "di" even under a Turkish locale).
	 *
	 * @param s1 the text to examine
	 * @param s2 the prefix (word) to look for at the start of <code>s1</code>
	 * @return true if <code>s1</code> is <code>s2</code> (ignoring case),
	 *         or starts with it followed by whitespace
	 * @throws NullPointerException if either argument is null
	 */
	public static boolean iciSWe(String s1, String s2) {
		final int prefixLen = s2.length();

		// regionMatches with ignoreCase=true folds case character by
		// character without consulting the default locale; it also
		// returns false when s1 is shorter than the prefix.
		if (!s1.regionMatches(true, 0, s2, 0, prefixLen)) {
			return false;
		}

		// now look for "end of word"
		if (s1.length() == prefixLen) {
			return true;
		}
		return Character.isWhitespace(s1.charAt(prefixLen));
	}


	/**
	 * The same as <tt>iciSWe</tt> except that it looks for punctuation
	 * instead of a space after the prefix (e.g. "al-Fayed", "d'Souza").
	 *
	 * More precisely, after the prefix <code>s1</code> must either end or
	 * continue with a character that is neither a letter nor a full stop;
	 * so "al-" and "d'" are accepted while "al." and "della" are not.
	 *
	 * The comparison ignores case on the whole prefix and does not depend
	 * on the JVM's default locale.
	 *
	 * @param s1 the text to examine
	 * @param s2 the prefix to look for at the start of <code>s1</code>
	 * @return true if <code>s1</code> is <code>s2</code> (ignoring case),
	 *         or starts with it followed by a non-letter other than '.'
	 * @throws NullPointerException if either argument is null
	 */
	public static boolean iciSWp(String s1, String s2) {
		final int prefixLen = s2.length();

		// Locale-independent, case-insensitive prefix test; also false
		// when s1 is shorter than the prefix.
		if (!s1.regionMatches(true, 0, s2, 0, prefixLen)) {
			return false;
		}

		// now look for "end of word"
		if (s1.length() == prefixLen) {
			return true;
		}
		final char next = s1.charAt(prefixLen);
		return next != '.' && !Character.isLetter(next);
	}

	/**
	 * Recognises strings which are XXX citation ids.
	 *
	 * Such an id is one of a limited (but growing?) set of archive
	 * names, a slash and a seven digit number of the form YYMMNNN,
	 * e.g. "hep-th/9901001". The archive name (everything before the
	 * first slash) is compared ignoring case and independently of the
	 * default locale. Only the seven characters directly after the slash
	 * are checked; anything following them is ignored.
	 *
	 * @param s the candidate identifier
	 * @return true if <code>s</code> has the form described above
	 * @throws NullPointerException if <code>s</code> is null
	 */
	public static boolean xxxId(String s) {
		final int slash = s.indexOf('/');
		if (slash < 0) {
			return false;
		}

		// Seven characters are needed after the slash, i.e. indices
		// slash+1 .. slash+7 must all exist.
		if (s.length() - slash < 8) {
			return false;
		}
		for (int i = slash + 1; i <= slash + 7; i++) {
			if (!Character.isDigit(s.charAt(i))) {
				return false;
			}
		}

		final String[] archives = {
			"gr-qc", "nucl-th", "nucl-ex", "astro-ph", "cond-mat",
			"quant-ph", "physics", "hep-ph", "hep-th", "hep-lat", "hep-ex"
		};
		// Locale.ROOT: archive names are ASCII, so the case folding must
		// not depend on the default locale.
		final String word = s.substring(0, slash).toLowerCase(Locale.ROOT);
		for (int i = 0; i < archives.length; i++) {
			if (archives[i].equals(word)) {
				return true;
			}
		}
		return false;
	}


	/**
	 * Recognises those words which start with a lowercase letter but
	 * are in fact parts of names (plus "and", which is not really a name
	 * but is treated the same way).
	 *
	 * Most particles must be followed by whitespace or the end of the
	 * string (checked with iciSWe); "al" and "d" are instead checked with
	 * iciSWp, which looks for a following non-letter.
	 *
	 * @param s the text starting at the word to classify
	 * @return true if <code>s</code> begins with one of the known particles
	 */
	public static boolean lowerCaseNameComponent(String s) {
		// Particles that stand as a word of their own.
		final String[] wordParticles = {
			"and",		//not really a name
			"della",	//Della Valle
			"del",		//del Sal
			"de",		//de la Rue
			"di",		//di Capprio
			"la",		//de la Rue
			"von",		//von Trapp
			"van"		//van Dyke
		};
		for (int i = 0; i < wordParticles.length; i++) {
			if (iciSWe(s, wordParticles[i])) {
				return true;
			}
		}

		// Particles that are glued to the name by punctuation.
		final String[] joinedParticles = {
			"al",		//al-Fayed
			"d"		//d'Souza
		};
		for (int i = 0; i < joinedParticles.length; i++) {
			if (iciSWp(s, joinedParticles[i])) {
				return true;
			}
		}

		return false;
	}


	/**
	 * Recognises the characters from all the character sets which could
	 * correspond to a "dash". This is a crucial part of recognising a
	 * page range: a dash with numeric strings adjacent is easily
	 * recognisable as a page range.
	 *
	 * Besides the Unicode hyphens and dashes U+2010 to U+2015, a number of
	 * code points are accepted because some legacy font encoding uses
	 * them for a dash; this includes ']', '{' (U+007B) and '|' (U+007C).
	 *
	 * @param ch the character to classify
	 * @return true if <code>ch</code> is one of the known dash characters
	 */
	public static boolean isDash(char ch) {
		// Unicode hyphen, non-breaking hyphen, figure dash, endash,
		// emdash and horizontal bar are contiguous.
		if (ch >= '\u2010' && ch <= '\u2015') {
			return true;
		}

		final String legacyDashes =
			"-"		//just a normal hyphen
			+ "]"		//seen in bids ap-aam sample
			+ "\u007b"	//TeX CM encoding endash
			+ "\u007c"	//TeX CM encoding emdash
			+ "\u00b1"	//adobe standard encoding endash
			+ "\u00d0"	//adobe standard encoding emdash
					//and mac roman encoding endash
			+ "\u00d1"	//mac roman encoding emdash
			+ "\u0096"	//Win Ansi encoding endash
			+ "\u0097"	//Win Ansi encoding emdash
			+ "\u0085"	//PDF doc encoding endash
			+ "\u0084"	//PDF doc encoding emdash
			+ "\u008a"	//PDF doc encoding minus
			+ "\u00ad";	//Win Ansi encoding hyphen

		//are there any others left?
		return legacyDashes.indexOf(ch) >= 0;
	}


	/**
	 * Checks whether the current word is in fact an initial / a set of
	 * initials as opposed to a surname.
	 *
	 * The rules, in order:
	 * <ul>
	 * <li>null or the empty string is not an initial;</li>
	 * <li>a single character is an initial iff it is an upper case letter
	 *     (so "," on its own is not);</li>
	 * <li>any longer word whose second character is a full stop is an
	 *     initial ("L.", "L.A.");</li>
	 * <li>otherwise the word must consist only of upper case letters,
	 *     full stops, commas and hyphens, with at most three upper case
	 *     letters ("LA", "PR,", "H-J"). The limit of three keeps
	 *     upper-cased surnames such as "CARR" from being taken for
	 *     initials.</li>
	 * </ul>
	 *
	 * @param s the word to classify; may be null
	 * @return true if <code>s</code> looks like an initial or set of initials
	 */
	public static boolean isInitial(String s) {
		// The empty string must be rejected up front: the second-character
		// test below would otherwise index past the end of the string.
		if (s == null || s.length() == 0) {
			return false;
		}

		if (s.length() == 1) {
			return Character.isUpperCase(s.charAt(0));
		}

		if (s.charAt(1) == '.') {
			return true;
		}

		//You also have initials iff whole word is
		//composed of capitals. Carr LA, Bagge PR.
		//Could be problematic if "CARR, Leslie A."
		//Put limit of THREE INITIALS then!
		int capitals = 0;
		for (int i = 0; i < s.length(); i++) {
			final char ch = s.charAt(i);
			if (Character.isUpperCase(ch)) {
				capitals++;
			} else if (ch != '.' && ch != ',' && ch != '-') {
				return false;
			}
		}
		return capitals <= 3;
	}


	/**
	 * Turns a set of "forenames" into an appropriate set of separated,
	 * correctly delimited initials, e.g. "Leslie Anthony" becomes "L.A."
	 * and "Jean-Paul" becomes "J-P.".
	 *
	 * Only upper case letters and hyphens of the input contribute to the
	 * result; every other character (including explicit full stops) is
	 * dropped. Input that already starts with an upper case letter
	 * followed by a hyphen, such as "H-J.", is returned unchanged.
	 *
	 * @param s the forename(s); may be null
	 * @return the initials, or the empty string if <code>s</code> is null
	 */
	public static String toInitials(String s) {
		if (s == null) {
			return "";
		}

		final int len = s.length();

		// Things like "H-J. Surname" should just return "H-J."
		if (len > 1 && Character.isUpperCase(s.charAt(0)) && s.charAt(1) == '-') {
			return s;
		}

		StringBuffer initials = new StringBuffer();
		for (int i = 0; i < len; i++) {
			final char ch = s.charAt(i);
			if (ch == '-') {
				// A hyphen replaces the character emitted just before
				// it (normally the full stop after the last initial).
				final int soFar = initials.length();
				if (soFar > 0) {
					initials.setLength(soFar - 1);
				}
				initials.append('-');
			} else if (Character.isUpperCase(ch)) {
				initials.append(ch).append('.');
			}
			// anything else, explicit full stops included, is ignored
		}
		return initials.toString();
	}



	/**
	 * Removes tags from an HTML-style string. These tags are in practice
	 * just the font-change tags &lt;b&gt; and &lt;i&gt;. It is used as a
	 * final stage filter after all the sections have been recognised in the
	 * original string, and just prior to their final output.
	 *
	 * Everything from a '&lt;' up to and including the next '&gt;' is
	 * dropped. If a '&lt;' has no closing '&gt;', the rest of the string
	 * is dropped. A '&gt;' outside a tag is kept as ordinary text.
	 *
	 * @param s the string to strip; may be null
	 * @return <code>s</code> without its tags, or null if <code>s</code> is null
	 */
	public static String detag(String s) {
		if (s == null) {
			return null;
		}

		final int len = s.length();
		StringBuffer out = new StringBuffer(64);
		int pos = 0;

		while (pos < len) {
			final int open = s.indexOf('<', pos);
			if (open < 0) {
				// no more tags: keep the remainder as it is
				out.append(s.substring(pos));
				break;
			}
			out.append(s.substring(pos, open));

			final int close = s.indexOf('>', open + 1);
			if (close < 0) {
				// unterminated tag swallows the rest of the string
				break;
			}
			pos = close + 1;
		}

		return out.toString();
	}


	/**
	 * Utility method that recognises valid characters (ie [a-z-]) within
	 * an XXX eprint article identifier.
	 *
	 * The test is made with Character.isLowerCase, so lower case letters
	 * outside a-z are accepted as well. An offset that lies outside the
	 * string (negative, or at/after its end) simply yields false.
	 *
	 * @param s the string containing the character to check
	 * @param i the character offset within the string to check
	 * @return true if the character at offset <code>i</code> exists and is
	 *         a lower case letter or a hyphen
	 * @throws NullPointerException if <code>s</code> is null
	 */
	public static boolean lowercaseOrHyphen(String s, int i) {
		final int len = s.length();
		if (i < 0 || i >= len) {
			return false;
		}
		final char ch = s.charAt(i);
		return ch == '-' || Character.isLowerCase(ch);
	}


	/**
	 * Utility method that encapsulates a naive heuristic (oh, alright
	 * then, hack) for determining whether the citation was to a
	 * conf/workshop proceedings.
	 *
	 * The citation counts as proceedings when the text "roceedings"
	 * occurs in <code>ds.line</code> at an index greater than zero, i.e.
	 * with at least one character in front of it (presumably the "P" or
	 * "p" -- the first letter is left out of the search text).
	 *
	 * @param ds the state holding the citation line to inspect
	 * @return true if the line looks like a reference to proceedings
	 */
	public static boolean isProceedings(DeciterState ds) {
		final int hit = ds.line.indexOf("roceedings");
		return hit > 0;
	}

	/**
	 * Utility method that encapsulates a naive heuristic (oh, alright
	 * then, hack) for determining whether the citation was to a
	 * book/thesis or not.
	 *
	 * A citation recognised by isProceedings is never a book. Otherwise it
	 * is taken to be a book when <code>ds.line</code> contains "edited "
	 * or "editor " (each with its trailing space), ignoring case. The
	 * case folding is locale-independent, so an upper-cased "EDITED " is
	 * found whatever the JVM's default locale is.
	 *
	 * @param ds the state holding the citation line to inspect
	 * @return true if the line looks like a reference to a book
	 */
	public static boolean isBook(DeciterState ds) {
		if (isProceedings(ds)) {
			return false;
		}

		/* djb - this one is BAD for online literature because
		   there are no page numbers in HTML source
		if(!ds.xxxHint && ds.pagb==-1)return true;
		*/

		// Locale.ROOT: the search words are plain ASCII, so lower-casing
		// must not follow locale-specific rules (e.g. Turkish dotless i).
		final String lline = ds.line.toLowerCase(Locale.ROOT);
		return lline.indexOf("edited ") >= 0 || lline.indexOf("editor ") >= 0;
	}


	/**
	 * A safe version of String.substring that never throws for
	 * out-of-range indices.
	 *
	 * Both indices are first clamped into the range 0..length; if the
	 * start then still lies beyond the end, the start is pulled back to
	 * the end, so the result is the empty string.
	 *
	 * @param line the string to cut from; may be null
	 * @param a the start index (inclusive)
	 * @param b the end index (exclusive)
	 * @return the requested part of <code>line</code>, or null if
	 *         <code>line</code> is null
	 */
	public static String substring(String line, int a, int b) {
		if (line == null) {
			return null;
		}

		final int len = line.length();
		final int end = Math.min(Math.max(b, 0), len);
		int start = Math.min(Math.max(a, 0), len);
		if (start > end) {
			start = end;
		}
		return line.substring(start, end);
	}
}
