package uk.ac.soton.harvester;
/**
 *
 * doAuthors recognises the author sequence in the citations.
 * It does not split into separate authors, that is left for
 * splitAuthors just before the output phase.
 * The purpose of this class is to set authb and authe.
 *
 * doAuthors divides into TWO cases, the trivial Hint1 case
 * (which is now hardly ever used) and the much more
 * complex "Plain" case. After these are called, a simple
 * clean-up is attempted for the situation where the author
 * choice has been too aggressive and possibly eaten into
 * another part of the citation (article title or journal name).
 * This is currently mainly used in the case of the XXX eprint
 * citation style, where there are few clues to the boundary between
 * author name and journal name.
 *
 * (djb) added the "whittleDown()" routine to handle cases where
 * the extra length of stuff after the date caused the code to
 * include the title inside the author string.
**/
public class DoAuthors implements AttributeMarker {

    /**
     * Marks the author sequence of the citation held in {@code ds}.
     *
     * Dispatches to {@link #doAuthorsHint1} when {@code ds.hint_Author1} is
     * set and to {@link #doAuthorsPlain} otherwise, then logs the resulting
     * {@code ds.authb}/{@code ds.authe} span through {@code Utils.DEBUG}.
     *
     * @param ds the citation state; {@code ds.line} is read and
     *           {@code ds.authb}/{@code ds.authe} may be written
     * @param i  the offset in the citation to start searching from
     * @return the offset at which the next search should start; {@code i}
     *         itself when {@code i} is at or beyond the last character
     */
    public int markAttribute(DeciterState ds, int i){
        String line=ds.line;
        int maxi=line.length()-1;

        if(i>=maxi)return i;

        int result = ds.hint_Author1 ? doAuthorsHint1(ds, i) : doAuthorsPlain(ds, i);

        if(ds.authb<0 || ds.authe<=ds.authb){
            Utils.DEBUG("Bad authors!");
        } else {
            // authe is treated as an inclusive end here, hence the +1.
            Utils.DEBUG("Found authors ->"+line.substring(ds.authb,ds.authe+1)+"<-");
        }

        return result;
    }


    /**
     * doAuthorsHint1 controls the relatively simple (and uncommon)
     * situation where a list of authors is written like "Carr LA and JM Cook."
     * Everything from {@code i} up to (but excluding) the next full stop is
     * taken as the author string.
     *
     * @param ds the citation state; {@code ds.line} is the full citation,
     *           {@code ds.authb}/{@code ds.authe} are set
     * @param i  the offset in the citation to start searching from
     * @return the offset at which the next search should start (the offset
     *         of the full stop, or of the last character if none was found)
     */
    protected int doAuthorsHint1(DeciterState ds, int i){
        String line=ds.line;
        int maxi=line.length()-1;

        ds.authb=i;
        while(i<maxi && line.charAt(i)!='.')i++;
        ds.authe=i-1;
        return i;
    }

    /**
     * doAuthorsPlain controls the gamut of author name list possibilities.
     * This method is one of the most complicated of the class.
     *
     * Two strategies are used. If a date has been located after {@code i}
     * and within the first half of the line, the authors are taken to be
     * everything up to the date (trimmed by {@link #whittleDown}).
     * Otherwise, provided the text starts with an upper-case letter, the
     * line is scanned character by character until one of several
     * end-of-author-list conditions is met.
     *
     * @param ds the citation state; {@code ds.line} is the full citation,
     *           {@code ds.dateb} is read, {@code ds.authb}/{@code ds.authe}
     *           are set when an author string is recognised
     * @param i  the offset in the citation to start searching from
     * @return the offset at which the next search should start
     */
    protected int doAuthorsPlain(DeciterState ds, int i){
        String line=ds.line;
        int maxi=line.length()-1;

        char oldch, ch;
        int oldi;
        boolean first=true;
        boolean commaednames=false;

        if(i>maxi)return i;

        // djb get to first non-blank and check for '"' which
        // means that we start with a title and no authors.
        // The skip is bounded so an all-whitespace tail cannot run off
        // the end of the line.
        while(i<=maxi && Character.isWhitespace(line.charAt(i)))i++;
        if(i>maxi)return i;
        ch=line.charAt(i);
        if(ch=='"')return i;

        Utils.DEBUG("a"+i);
        //if circumstances are favourable, then you
        //just go from where you are til the date
        //(djb) unless date is before us, e.g. in the "number"
        int halfway=(maxi+1)/2;
        if(ds.dateb>i && ds.dateb<halfway){
            // Back up over any '(' and whitespace that precede the date,
            // never going further back than the start position.
            int end=ds.dateb-1;
            while(end>=i &&
                  (line.charAt(end)=='(' || Character.isWhitespace(line.charAt(end)))){
                end--;
            }
            // If nothing but '(' precedes the date there is no author text
            // here; fall through to the plain scan below instead.
            if(end>=i){
                ds.authb=i;
                ds.authe=end;
                Utils.DEBUG("b");
                Utils.DEBUG("A{"+line.substring(ds.authb, ds.authe)+"}@["+ds.authb+"]");
                // (djb) remove title if necessary
                ds.authe = whittleDown(line,ds.authb,ds.authe)+1;
                Utils.DEBUG("w"+ds.authe);
                return ds.authe+1;
            }
        }

        oldch=' ';
        oldi=i;
        if(Character.isUpperCase(ch=line.charAt(i))){
            ds.authb=i;
            Utils.DEBUG("c "+i);
            while(Character.isLetter(ch=line.charAt(i)) ||
                  Character.isWhitespace(ch) ||
                  ch=='-' ||
                  ch=='.' ||
                  ch==',' ||
                  ch=='\'' ||
                  ch=='&' ||
                  ch=='>' ||
                  (int)ch<32 ||
                  (int)ch>126
                  ){

                if(i>=maxi)break;

                Utils.DEBUG("d "+i);
                // The first whitespace tells us whether names are written
                // "Surname, Initials" (comma directly before the blank).
                if(first && Character.isWhitespace(ch)){
                    char prech=line.charAt(i-1);
                    first=false;
                    if(prech==',')commaednames=true;
                }

                // From here on ch is the character at i-1.
                i++;

                Utils.DEBUG("e "+i);
                //ignore all tags!
                // NOTE(review): '<' is not accepted by the loop condition
                // above, so this branch cannot currently be reached.
                if(ch=='<'){
                    while(i<maxi-2 && line.charAt(i)!='>')i++;
                    i++;
                    //ds.dbugWriter.println("Skipped to ->'"+line.substring(i));
                    continue;
                }

                Utils.DEBUG("f "+i);
                //How do you know when you've found the end of
                //the list of authors?
                //Well, either you get a date (which will
                //terminate the loop anyway)

                //OR you get a fullstop which is not at the
                //end of an initial
                // regionMatches is used rather than substring so that a
                // short word at the very start of the line (negative
                // offset) simply fails to match instead of throwing.
                if(ch=='.' &&
                   !Character.isUpperCase(line.charAt(i-2)) &&
                   !line.regionMatches(true, i-3, "jr", 0, 2) &&
                   !line.regionMatches(true, i-4, "snr", 0, 3)){
                    Utils.DEBUG("ZChecking out at ->"+line.substring(i));
                    break;
                }

                //OR you get et al. (djb)
                if(line.startsWith("et al", i-1)){
                    // i is already one past the 'e', so i+5 is one past the
                    // character that follows "et al". Clamp to the end of
                    // the line so the look-ahead below stays in range.
                    i=Math.min(i+5, maxi+1);
                    if(i<=maxi && line.charAt(i)=='.')i++;
                    if(i<=maxi && line.charAt(i)==',')i++;
                    break;
                }

                //OR you get an unauthorised lowercase word
                if(Character.isLowerCase(ch) &&
                   Character.isWhitespace(oldch) &&
                   !Utils.lowerCaseNameComponent(line.substring(i-1)) &&
                   !Utils.xxxId(line.substring(i-1))){
                    i=oldi-1;
                    Utils.DEBUG("UChecking out at ->"+line.substring(i));
                    //if this happens, back up to the previous
                    //position, cos the title will have started
                    //with a capital!
                    break;
                }

                //OR if all authornames are commaed thusfar,
                //you get an uncommed name
                // (djb) but check for the case where we have one
                // commaed name followed by a list of uncommaed names
                if(commaednames && Character.isUpperCase(ch)){
                    int j;
                    Utils.DEBUG("Checking out at ->"+line.substring(i));
                    // find the last character of the current word
                    for(j=i; j<=maxi && !Character.isWhitespace(line.charAt(j)); j++);
                    j--;
                    // (djb) add check mentioned above
                    // Note: this deliberately overwrites ch with the last
                    // character of the word; the code below relies on it.
                    if((ch=line.charAt(j))!=',' && ch!='.' ) {
                        if ( isFirstName(line,j)) // djb
                            commaednames = false;
                        else {
                            i--;
                            break;
                        }
                    }
                }

                //OR (physics preprints)
                //you get to the journal title which is
                //two words neither of which are initials
                // (disabled by the leading "false &&")
                if(false && ds.xxxHint && Character.isUpperCase(ch) && Character.isWhitespace(oldch)){
                    int j=i+1;
                    Utils.DEBUG("XXX hint at ["+j+"] '");//+line.substring(j,6)+"...'");
                    if(Character.isLowerCase(line.charAt(j))){
                        while(j<line.length() && !Character.isWhitespace(line.charAt(j)))j++;
                        while(j<line.length() && Character.isWhitespace(line.charAt(j)))j++;
                        if(Character.isUpperCase(line.charAt(j)) && Character.isLowerCase(line.charAt(j+1))){
                            Utils.DEBUG("XXX test succeeded");
                            break;
                        }
                    }
                    Utils.DEBUG("XXX test failed");
                }

                //OR (special case) the title starts with
                //TWO apostrophes!
                if(ch=='\'' && oldch=='\''){
                    i-=2;
                    Utils.DEBUG("g");
                    break;
                }

                // Remember where the most recent capitalised word started
                // (oldi is one past its first letter).
                if(Character.isWhitespace(oldch) && Character.isUpperCase(ch))oldi=i;
                oldch=ch;
            }          // end while
            ds.authe=i-1;
        }
        Utils.DEBUG("DoAuthors returning i = " + i);
        return i;
    }

    /**
     * (djb) Decides whether the text following position {@code j} looks
     * like a further author name in a list of uncommaed names, as opposed
     * to the first word of the title.
     *
     * Lower-case words accepted by {@code Utils.lowerCaseNameComponent}
     * and literal {@code "Van "} prefixes are skipped first. If the next
     * character is upper-case, the word it starts is skipped as well. The
     * answer is true only when the character that ended the scan is
     * {@code '.'}, {@code '-'} or {@code ','}. A word that runs to the end
     * of the line is never treated as a name.
     *
     * @param line the full citation
     * @param j    the position of the last character preceding the blank
     *             that follows the word just scanned by the caller
     * @return true if the following word looks like part of an author name
     */
    private boolean isFirstName ( String line, int j ) {
        final int len = line.length();
        int i = j+2;     // first character after the blank
        if ( i >= len ) return false;
        Utils.DEBUG("ilna"+i);

        char ch;

        // First skip over any "von" part of the name
        while ( true ) {
            if ( i >= len ) {
                // ran off the end of the line while skipping prefixes
                Utils.DEBUG("ilnc"+i+" false");
                return false;
            }
            ch = line.charAt(i);
            int k = i;
            if ( Character.isLowerCase(ch) ) {
                // advance to the first non-letter after the word
                while ( i < len && Character.isLetter(line.charAt(i)) ) i++;
                if ( i >= len ) {
                    // lower-case word runs to the end of the line
                    Utils.DEBUG("ilnc"+i+" false");
                    return false;
                }
                i++;   // i-1 is now the character that ended the word
                if ( !Utils.lowerCaseNameComponent(line.substring(k,i-1)))
                    break;
                if ( !Character.isWhitespace(line.charAt(i-1))) break;
            } else {
                if ( line.startsWith("Van ", i) )
                    i=i+4;
                else
                    break;
            }
        }
        // Clamp the preview so the debug output cannot throw near the
        // end of the line.
        Utils.DEBUG("ilnb"+i+" "+line.substring(Math.min(i,len), Math.min(i+10,len))+"...");

        // Now i points to first character after [von*] and a blank
        if ( i < len && Character.isUpperCase(line.charAt(i)) ) {
            // scan until you get to the end of the word
            while ( i < len && Character.isLetter(line.charAt(i)) ) i++;
            if ( i >= len ) {
                // word runs to the end of the line: nothing follows it
                Utils.DEBUG("ilnc"+i+" false");
                return false;
            }
            i++;   // i-1 is now the character that ended the word
        }
        ch = line.charAt(i-1);
        if (ch != '.' && ch != '-' && ch != ',' ) {
            Utils.DEBUG("ilnc"+i+" false");
            return false;
        }
        Utils.DEBUG("ilnd"+i+" true ");
        return true;
    }

    /**
     * The objective of whittleDown() is to determine where the end of the
     * author string *really* is.  The hypothesis is that the authors end
     * just before the date, but in the case of
     * &lt;author string&gt;&lt;title&gt;&lt;date&gt;&lt;more-than-half-the-refString&gt;
     * this will include the title.
     *
     * The candidate text {@code line.substring(beginAuth, endAuth)} is
     * searched for the first full stop whose preceding word is not an
     * initial (per {@code Utils.isInitial}), "et", "al", or a
     * jr/sr/snr suffix.
     *
     * @param line      the full citation
     * @param beginAuth offset in {@code line} where the author string starts
     * @param endAuth   offset in {@code line} of the hypothesised end
     * @return the offset in {@code line} of the last character before the
     *         first such full stop, or {@code endAuth} if there is none
     */
    private int whittleDown ( String line, int beginAuth, int endAuth ) {

        Utils.DEBUG("whittleDown [" + line + "]");

        int i; // location of first full stop in the working string
        String work = line.substring( beginAuth, endAuth );
        String token;   // word preceding a full-stop
        int k=0; // keep track of how much of "work" we lop off

        Utils.DEBUG("work ["+work+"]");

        while ( (i=work.indexOf(".")) != -1 ) {

            // The author names string does contain a full stop.  Check
            // to see whether it is an initial, honorific, or really a
            // full stop.  Set token to the word before the "."

            int j = ( work.substring(0,i) ).lastIndexOf(" ");

            // j == -1 (no blank) makes this start at 0, i.e. the whole prefix
            token = work.substring(j+1,i);
            Utils.DEBUG("token: "+token+", i="+i+",j="+j+",k="+k);

            if ( j < i-1           // if j=i-1, we have an isolated dot, " ."
              && !Utils.isInitial(token+".")        // djb 10-30-00
              && !token.equals("al")
              && !token.equals("et")
              && !token.equalsIgnoreCase("jr")
              && !token.equalsIgnoreCase("sr")
              && !token.equalsIgnoreCase("snr") ) {
                // k+i is relative to the start of the candidate text;
                // add beginAuth so the result indexes into line, like
                // the endAuth returned below.
                return beginAuth+k+i-1;         // new value for ds.authe
            }

            work = work.substring(i+1);    // look further
            k = k + i+1;   // length up to and including the "."

        } // end while

        return endAuth;          // no premature full-stop found

    } // ends whittleDown

}
