package uk.ac.soton.harvester;
/**
 *
 * DoPageRange recognises the occurrence of the article's page range.
 * It is based around the elaborate <tt>Utils.isDash</tt> method which
 * provides a sure-fire mechanism for spotting the dash in the middle of
 * a page range.
 * In the eventuality that no dash is recognised, just try and set the
 * misc[be] variables. Otherwise, try and set miscb, misce, pagb and page.
 * This gets a bit convoluted and should be separated into individual classes.
 * The 'misc' portion is the bit between the publication name and the
 * page number which should contain the volume and issue details for
 * subsequent extraction.
 *
 **/
public class DoPageRange implements AttributeMarker {
	/**
	 * Looks for a page range ("digits, dash, digits", with optional
	 * whitespace around the dash) in <tt>ds.line</tt> and records it in
	 * <tt>ds.pagb</tt>/<tt>ds.page</tt>. It then marks the 'misc' portion
	 * that precedes it in <tt>ds.miscb</tt>/<tt>ds.misce</tt>.
	 *
	 * All string indexing is bounds-checked, so a page range, date or
	 * dash sitting at the very start of the line no longer raises a
	 * StringIndexOutOfBoundsException.
	 *
	 * @param ds the parse state; its line is read and its pagb, page,
	 *           miscb and misce markers may be written
	 * @param i  the index in the line at which the search starts
	 * @return <tt>i</tt>, unchanged
	 */
	public int markAttribute(DeciterState ds, int i){
		String line=ds.line;
		int maxi=line.length()-1;
		int j;

		//Work backwards iff proceedings. Page range is usually last.
		//Conference dates which appear as part of the
		//publication name e.g. "ACM HT 97 April 16-19"
		//may intrude otherwise.
		int first=i, last=maxi, incr=1;
		if(Utils.isProceedings(ds)){
			first=maxi-1;
			last=i;
			incr=-1;
		}

		for(j=first; (incr==1)?j<last:j>last; j+=incr){
			if(!Utils.isDash(line.charAt(j)))continue;

			//need to allow spaces around the -
			int y=j-1;
			while(y>=0 && Character.isWhitespace(line.charAt(y)))y--;
			//nothing but whitespace (or nothing at all) before the
			//dash: it cannot be the middle of a page range
			if(y<0)continue;
			int z=j+1;
			while(z<maxi && Character.isWhitespace(line.charAt(z)))z++;

			if(!Character.isDigit(line.charAt(y)) ||
			   !Character.isDigit(line.charAt(z))) continue;

			//extend over the digits on either side of the dash;
			//the left scan may legitimately run off the start of
			//the line, in which case the range begins at index 0
			int k;
			for(k=y; k>=0 && Character.isDigit(line.charAt(k)); k--);
			ds.pagb=k+1;
			for(k=z; k<=maxi && Character.isDigit(line.charAt(k)); k++);
			ds.page=k-1;
			break;
		}

		//bornDigital works don't have page numbers, in which case
		//(when the hint is not set) we fall through and work
		//backwards from the date instead.
		if(ds.xxxHint && (ds.pagb==-1)){
			//if we didn't find a page range, then,
			//we may just be looking for some numbers!
			if(ds.authe>=0 || ds.titb>=0){
				if(ds.authe>ds.titb)ds.miscb=ds.authe+1;
				else ds.miscb=ds.tite+1;
				if(ds.dateb>ds.miscb)ds.misce=ds.dateb-1;
				else ds.misce=line.length()-1;
			}
			//the markers may not have been set above; only slice
			//the line when they describe a valid range so that a
			//debug message can never abort the parse
			String misc="";
			if(ds.miscb>=0 && ds.miscb<=ds.misce+1 && ds.misce<line.length()){
				misc=line.substring(ds.miscb,ds.misce+1);
			}
			Utils.DEBUG("NOPR! " + ds.miscb + " " + ds.misce+" '"+misc+"'");
			return i;
		}

		//the misc portion is the bit between the publication
		//name and the page number. hopefully it contains
		//volume and issue details

		//start just before the page number and go back
		//until you hit a word.  If we have no page number, start
		//just before the date.  If we have no date, start at end
		//of line.
		int d;
		if(ds.pagb==-1){
			if(ds.dateb==-1)d=ds.misce=line.length()-1;
			else d=ds.misce=ds.dateb-1;
		} else d=ds.misce=ds.pagb-1;
		Utils.DEBUG("Misc ends "+ds.misce);

		if(d<0){
			//nothing precedes the page range/date (or the line is
			//empty): the misc portion is empty and starts at 0
			ds.miscb=0;
		} else {
			while(true){
				//skip tags
				if(line.charAt(d)=='>'){
					d--;
					while(d>=0 && line.charAt(d)!='<')d--;
					if(d>0)d--;
				}
				Utils.DEBUG("dpr1 " + d);
				//back up to the last letter of the preceding word
				while((d>0) && !Character.isLetter(line.charAt(d))) d--;
				//if that letter is the end of a tag name, stay
				//outside the tag
				if(d<maxi && line.charAt(d+1)=='>')d++;
				ds.miscb=d+1;

				//what is this word exactly?
				//vol. volume issue iss pages page OR pp
				//NOTE(review): the scan stops at index 0 without
				//stepping past it, so a word starting at column 0
				//is seen without its first letter - confirm intended
				while(d>0&&Character.isLetter(line.charAt(d)))d--;
				String s=line.substring(d+1,ds.miscb).toLowerCase();
				//volume/issue/page keywords belong to the misc
				//portion, so keep going back past them
				if(s.startsWith("page")||s.startsWith("vol")||
				   s.startsWith("iss"))continue;
				else break;
			}
		}

		//it's possible that the misc has intruded on the
		//year number. If so, it must be wrong because volume
		//numbers must be < 1900!
		Utils.DEBUG("Misc begins "+ds.miscb);
		if(ds.miscb<=ds.datee && ds.pagb>ds.datee)ds.miscb=ds.datee+1;

		Utils.DEBUG("PR="+ds.pagb+","+ds.page);

		return i;
	}
}

