package uk.ac.soton.harvester;
/**
 *
 * DoLink recognises an http URL that is included in the reference
 * string.  The enclosing '<' and '>' are optional.  So here is what is
 * recognised:  "<http://" address ">"  |  "http://" address  - djb
 *
**/

// Updates:
// 2000-10-12:  Turns out that DoPageRange can reset miscb-misce after
//              the URL.  So use tite and misce instead of i.
// 2000-10-18:  Add digits to urlLetter as long as it isn't the first
//              one after the slash
// 2000-10-31:  Sometimes part of the URL is included in the title.
//              Add "check" to check for this.  Also search for http
//              anywhere in the line.
// 2000-11-02:  When url ends the line, don't drop the final character
// 2000-11-03:  Since spaces and brackets are so unreliable (the SAX
//              parser in XHTMLAnalyzer seems to regularly swallow the
//              final '>' on a URL), accept URLs with unbalanced brackets

/**
 * Attribute marker that looks for an <code>http://</code> URL in the
 * reference line held by a {@link DeciterState} and records where it
 * begins and ends in <code>ds.urlb</code> / <code>ds.urle</code>.
 *
 * Two forms are accepted: <code>&lt;http://address&gt;</code> and a bare
 * <code>http://address</code>.  A URL with an opening '&lt;' but no
 * closing '&gt;' is treated as a bare URL.  After the URL range is set,
 * the title range (<code>ds.titb</code> / <code>ds.tite</code>) is
 * adjusted so that it does not overlap the URL.
 */
public class DoLink implements AttributeMarker {

	/** Text that marks the start of a URL. */
	private static final String URL_PREFIX = "http://";

	public DoLink(){}

	/**
	 * Finds the first <code>http://</code> in <code>ds.line</code> and
	 * stores its extent in <code>ds.urlb</code> and <code>ds.urle</code>.
	 *
	 * @param ds  parser state; <code>line</code> and <code>tite</code> are
	 *            read, <code>urlb</code>/<code>urle</code> are written, and
	 *            the title/author ranges may be rewritten by
	 *            {@link #check}
	 * @param i   current scan position; returned unchanged when
	 *            <code>ds.tite</code> is negative or the line holds no
	 *            <code>http://</code>
	 * @return    the index of the closing '&gt;' for a bracketed URL, the
	 *            index just past the last URL character for a bare URL,
	 *            or <code>i</code> when nothing was marked
	 */
	public int markAttribute(DeciterState ds, int i){

		Utils.DEBUG("DoLink scanning link at " + ds.tite);
		if ( ds.tite < 0 ) {
			return i;
		}

		String line = ds.line;
		int maxi = line.length() - 1;

		// Search the whole line, not just the text after the title:
		// part of the URL is sometimes included in the title.
		int j = line.indexOf(URL_PREFIX);
		if ( j == -1 ) {
			return i;
		}

		// Only look at the preceding character when there is one; a
		// line that starts with "http://" has nothing before it.
		boolean brackets = j > 0 && line.charAt(j - 1) == '<';
		ds.urlb = j;
		Utils.DEBUG("DoLink: url begins at " + j
		       +" and brackets="+brackets);
		Utils.DEBUG( line.substring(ds.urlb) );

		if ( brackets ) {
			int close = line.indexOf('>', j);
			if ( close != -1 ) {
				ds.urle = close - 1;
				return check(ds, close);
			}
			// Unbalanced bracket: fall through and treat the URL
			// as if it had been written without brackets.
			Utils.DEBUG("DoLink finds url with missing '>'");
		}

		// Bare URL: start after the last '/' and consume URL letters.
		// NOTE(review): this is the last '/' in the whole line, not
		// necessarily one that belongs to the URL -- confirm intended.
		// "http://" was found above, so lastIndexOf cannot return -1.
		i = line.lastIndexOf('/');
		Utils.DEBUG("DoLink: path ends at " + i );
		i++;  // advance to filename (if any)

		while ( i <= maxi && isUrlLetter(line.charAt(i)) ) {
			i++;
		}

		ds.urle = i - 1;
		// A trailing '.' is dropped from the URL range.
		if ( line.charAt(ds.urle) == '.' ) {
			ds.urle--;
		}
		return check(ds, i);
	}

	/**
	 * Tells whether <code>ch</code> may appear in the part of a bare URL
	 * that follows the last slash: letters, digits, and the characters
	 * '.', '-', '_', '%' and '#'.
	 */
	private static boolean isUrlLetter (char ch) {
		if ( ch == ' ' ) {
			return false;
		}
		if ( Character.isLetter(ch) || Character.isDigit(ch) ) {
			return true;
		}
		return ch == '.' || ch == '-' || ch == '_'
		    || ch == '%' || ch == '#';
	}

	/**
	 * Compares the URL range (urlb, urle) with the title range
	 * (titb, tite) and adjusts the title where the tests below match.
	 * The three tests run in order, each on the values left by the
	 * previous one.  Finally, if the title spans fewer than 3 positions
	 * and the author range spans more than 3, the author range becomes
	 * the title and the author range is cleared.
	 *
	 * @param ds  parser state whose title/author ranges may be rewritten
	 * @param i   value to hand back to the caller
	 * @return    <code>i</code>, unchanged
	 */
	private int check ( DeciterState ds, int i ) {
		Utils.DEBUG ("DoLink checking title range = " + ds.titb
		   + "," + ds.tite + " against url range = " + ds.urlb
		   + "," + ds.urle);

		// Title lies strictly inside the URL: discard the title.
		if ( ds.urlb < ds.titb && ds.tite < ds.urle ) {
			ds.titb = ds.tite = -1;
			Utils.DEBUG("Title is contained in url, reset to null");
		}

		// URL starts inside the title: end the title before the URL.
		if ( ds.titb < ds.urlb && ds.urlb < ds.tite ) {
			ds.tite = ds.urlb - 1;
			Utils.DEBUG("Title overlaps url, reset to before URL");
		}

		// URL starts before the title and ends before the title's end:
		// start the title after the URL.
		// NOTE(review): this also matches when the URL lies entirely
		// before the title (urle < titb) -- confirm that is intended.
		if ( ds.urlb < ds.titb && ds.urle < ds.tite ) {
			ds.titb = ds.urle + 1;
			Utils.DEBUG("Title overlaps url, reset to after URL");
		}

		// check for empty title.  Swap with authors if they are not empty.
		if ( (ds.tite - ds.titb) < 3 && ds.authe - ds.authb > 3 ) {
			ds.titb = ds.authb;
			ds.tite = ds.authe;
			ds.authb = ds.authe = -1;
			Utils.DEBUG("Resulting title is empty, swap with author");
		}
		return i;
	}
}
