// bergmark - May 2000 - Reference Linking Project

// DecodeURL is a utility that manages URL grammars for various repositories
// For some repositories, it is possible to obtain a DOI from the URL,
// and sometimes even the URL of a metadata (XML) file for the given URL
// Many URLs also contain some fragment giving the publication date

// This assumes we analyze no more than one repository per run

// Problem to be resolved: journals like those from ACM have metadata
// but it is in plain HTML which would have to be "scraped" to get the
// information.

// To use this class, call
//    DecodeURL.decode ( some-URL-string );   (once per url)
// No separate initialization call is needed: initialize() is private,
// and decode sets up the grammar tables itself before the first url is
// parsed.

// Modification:
// 2000-07-28:      Dispense with DIDs.  It is DOIs that we generate
//                  from URLs.
// 2000-07-31:      Make sure date is returned in DC format (yyyy-mm-dd)
// 2001-07-24:      decode should never return null; return domain instead
// 2001-07-24:      Add grammars for decoding ACM URLs

package Linkable.Utility;

import Linkable.API.Creation;
import Linkable.API.SurrogateException;

import java.util.Vector;
import java.util.Hashtable;

public class DecodeURL {

   private static final String ME = "DecodeURL: ";
   private static final boolean DEBUG = CONFIG.DEBUG;

   // Some (fixed) basic grammars:
   // <monthname> is in "january,february,... december"
   // <month> is "01" through "12"
   // <aname> is a string of lower-cased letters, no leading digits, 
   //     no period or slash.  like "moore-pt2" or "birdsey" or "p302-clare"
   // <year> is "00" through "99" with anything before "65" considered to
   //        be in century 2000.
   // <syear> is the 2-digit <year> for years in century 1900 and the
   //        4-digit year ("2000", "2001", ...) for years in century 2000
   // <oamsyear> is a 4-digit year number
   // <domain> is like "www.dlib.org"
   // <path> is like "/dlib/december99/" or "/dlib/april00/birdsey"
   // <filename> is like "12miller.html"
   // <issue> is like "xrds5-4" or "33-1"
   // <journal> is the short journal name, like "cacm" or a proceedings like "comm"

   // Two-digit years below this value belong to century 2000, all others
   // to century 1900 (see the description of <year> above).
   private static final int CENTURY_PIVOT = 65;

   // Maps each month name to its two-digit number; filled by initialize().
   // Its values are the only strings accepted for <month>.
   private static Hashtable months = new Hashtable();
   private static final String[] monthnames = {
   "january","february","march","april","may","june",
   "july","august","september","october","november", "december"};

   //==============  General URL Grammar ===================

   private static Vector generalUrl = new Vector();
   private static String[] generalUrlTree =
       {"http://", "<domain>", "<path>", "<filename>"};

   //===============  Grammar for ACM Journals ====================
   // Each grammar vector holds, in this order: the url grammar, the DOI
   // grammar, the grammar of the metadata (XML) url, the date grammar.
   // Trailing entries may be missing and any entry may be null.
   private static Vector ACM_journal = new Vector();
   private static Vector ACM_magazine = new Vector();
   private static Vector ACM_proceedings = new Vector();
   private static String[] ACM_journal_url = {
      "http://www.acm.org/pubs/articles/journals/",
      "<journal>", "/", "<oamsyear>", "-", "<issue>", "/",
      "<aname>", "/", "<aname>", ".pdf"
   };
   private static String[] ACM_magazine_url = {
      "http://www.acm.org/",  "<journal>", "/", "<issue>", "/",
      "<aname>", ".html"
   };
   private static String[] ACM_proceedings_url = {
      "http://www.acm.org/pubs/articles/proceedings/",
      "<journal>", "/", "<issue>", "/", "<aname>", "/", "<aname>", ".pdf"
   };
   private static String[] ACM_DOI = {
      "ACM/", "<journal>", "/", "<issue>", "-", "<aname>"
   };
   private static String[] ACM_pubDate = { "<oamsyear>" };

   //==============  Grammar for D-Lib Magazine ===================
   private static Vector D_Lib = new Vector();
   private static String[] urlTree = {
      "http://www.dlib.org/dlib/", "<monthname>", "<year>", "/",
      "<path>","<month>", "<aname>", ".html"
   };
   private static String[] DOI = {
      "10.1045/","<monthname>","<syear>","-","<aname>"
   };
   private static String[] xmlTree = {
      "http://www.dlib.org/dlib/","<monthname>","<year>","/",
      "<path>","<month>","<aname>", ".meta.xml"
   };
   private static String[] pubDate = {
      "<oamsyear>","-","<month>","-01"
   };

   // ========  Public Methods ====================================

   // Given a url, returns a DOI and optionally an XML URL and
   //  optionally a publication date.
   // Returned result (never null, always three elements):
   //   element 1 is a DOI, or just the domain name of the url when no
   //             repository grammar applies to it
   //   element 2 is the URL of the XML (metadata) file, or null
   //   element 3 is a publication date, or null
   // Throws SurrogateException if the url is null, is not an http url,
   // or starts like a known repository url but cannot be parsed.
   public static Vector decode ( String url ) 
   throws SurrogateException {
      if ( url == null )
         throw new SurrogateException (
         ME+"cannot decode a null url");
      if ( DEBUG )
         System.err.println(ME+"is decoding " + url );
      if ( ! isInitialized ) initialize();

      // analyze the url to determine which grammar to apply
      // (1) get the domain and the pathname
      if ( ! url.startsWith ( "http://", 0 ) )
         throw new SurrogateException (
         ME+"only knows how to decompose http urls");

      String domain,pathname;
      int slash = url.indexOf ('/',7);  // first "/" after "http://"
      if ( slash == -1 ) {
         domain = url.substring(7);
         pathname = "";
      }
      else {
         domain = url.substring(7, slash);
         pathname = url.substring ( slash, url.lastIndexOf ('/')+1 );
      }
      if ( DEBUG ) 
         System.err.println(ME+"domain= "+domain+", pathname="+pathname);

      // we don't need to synthesize a DOI here - it will get done
      // later when this Work gets stashed into the Creation database
      // However, if we have the DOI on hand, we should get it now
      Vector result = null;
      if ( pathname.startsWith ("/dlib") )
         result = decode2 (D_Lib, url);

      // If not dlib, try ACM, which currently has one of three forms
      if ( result == null && domain.equals("www.acm.org") ) {
         result = decode2 (ACM_proceedings, url);
         if ( result == null ) result = decode2 (ACM_journal, url);
         if ( result == null ) result = decode2 (ACM_magazine, url);
      }
      if ( result != null ) return result;

      // Do not return a null result; at least return the domain name.
      // This also covers urls that looked like D-Lib or ACM urls but
      // matched none of their grammars.
      Vector v = new Vector();
      v.add ( domain ); v.add(null); v.add(null);
      return v;
   }

   // ============  PRIVATE ROUTINES ======================
   private static boolean isInitialized = false;

   // Values of the non-terminals found by the most recent decode2 parse.
   // decode2 clears them before each parse; parseDOI reads them.
   private static String monthname,month,aname,year,syear,path,oamsyear;
   private static String journal,issue;

   // statements which initialize the structures
   private static void initialize () {
      // "january" -> "01" ... "december" -> "12"
      for ( int m = 0; m < monthnames.length; m++ )
         months.put ( monthnames[m], (m < 9 ? "0" : "") + (m+1) );
      generalUrl.addElement ( generalUrlTree );
      ACM_journal.addElement(ACM_journal_url);
      ACM_journal.addElement(ACM_DOI);
      ACM_journal.addElement(null);              // no metadata
      ACM_journal.addElement(ACM_pubDate);
      ACM_magazine.addElement(ACM_magazine_url);
      ACM_magazine.addElement(ACM_DOI);
      ACM_proceedings.addElement(ACM_proceedings_url);
      ACM_proceedings.addElement(ACM_DOI);
      D_Lib.addElement ( urlTree );
      D_Lib.addElement (DOI);
      D_Lib.addElement (xmlTree);
      D_Lib.addElement (pubDate);
      isInitialized = true;
      if ( DEBUG )
         System.err.println(ME+"has been initialized.");
   }

   // decode2 -
   // The grammars have been initialized.  Apply the grammar ( that
   // is in the first element of the provided vector) to the URL, then
   // generate the DOI, the XML url and the date from elements 2, 3 and 4
   // of the provided vector.
   // Returns null if a constant of the grammar does not appear in the
   // url where expected, i.e. this is the wrong grammar for the url.
   // Otherwise returns a vector of exactly three elements (DOI, XML url,
   // date); an element is null when the provided vector has no grammar
   // for it.
   // Throws SurrogateException (including the reason why) if the
   // constants match but a non-terminal cannot be extracted from the
   // url, or if the grammar itself is invalid.
   private static Vector decode2 ( Vector v, String url ) 
   throws SurrogateException {
      if ( DEBUG )
         System.err.println(ME+"in decode2");
      String[] grammar = (String[])v.elementAt(0);
      if ( DEBUG )
         System.err.println(ME+"parsing with grammar of " + grammar.length
         + " elements");

      // Forget whatever an earlier parse found, so that values from
      // another url can never leak into the strings generated below.
      monthname = month = aname = year = syear = path = oamsyear = null;
      journal = issue = null;

      int pos=0;  // position in url where we are scanning
      int k;      // temporary variable
      for (int i = 0; i < grammar.length; i++ ) {
         String s = grammar[i];     // either a "<name>" or a string
         if ( DEBUG )
            System.err.println(ME+"parsing element " + i + ": " + s
            + " Remaining url: " + url.substring(pos) );

         if ( ! s.startsWith("<") ) {  // s is a constant.  
            if ( ! url.startsWith ( s, pos) ) {
               if ( DEBUG )
                  System.err.println(ME+"serious problem in decode2..." 
                  + " appying wrong grammar?");
               if ( DEBUG )
                  System.err.println(ME+"decode2: url should have "
                  + s + " at position " + pos + " but does not.");
               return null;
            }
            pos += s.length();
            continue;
         }

         // we have the name of a non-terminal.  
         // Initialize proper variable with the text found in the url.
         if ( s.equals("<monthname>") ) {
            int j = 0;
            while ( j < monthnames.length
                    && ! url.startsWith ( monthnames[j], pos ) ) j++;
            // j == monthnames.length means that no month name matched
            if ( j >= monthnames.length )
               errorExit("monthname expected at pos " , pos);
            monthname = monthnames[j];
            if ( DEBUG )
               System.err.println(ME+"monthname set to " + monthname);
            pos += monthname.length();
         } else if (s.equals("<month>") ) {
            if ( ! hasDigits ( url, pos, 2 ) )
               errorExit("2-digit month expected at pos ", pos);
            month = url.substring(pos,pos+2);     
            if ( ! months.containsValue ( month ) )
               errorExit("month 01 through 12 expected at pos ", pos);
            if (DEBUG)
               System.err.println(ME+"month set to " + month);
            pos+=2;
         } else if (s.equals("<aname>") ) {
            // TBD check for all lowercase, no punc.
            // Determine where aname ends by peeking at next string
            k = endOfField ( url, pos, grammar, i );
            aname = url.substring(pos,k);     
            if (DEBUG)
               System.err.println(ME+"aname set to " + aname);
            pos = k;
         } else if (s.equals("<year>") ) {
            if ( ! hasDigits ( url, pos, 2 ) )
               errorExit("2-digit year expected at pos ", pos);
            year = url.substring(pos,pos+2);
            if (DEBUG)
               System.err.println(ME+"year set to " + year);
            pos+=2;
            // Anything before the pivot is in century 2000 and gets a
            // 4-digit <syear>, as "00" always did; the rest is 19xx.
            if ( Integer.parseInt ( year ) < CENTURY_PIVOT ) {
               syear = "20"+year; oamsyear = "20"+year;
            } else {
               syear = year; oamsyear = "19"+year;
            }
            if (DEBUG)
               System.err.println(ME+"syear set to " + syear);
            if (DEBUG)
               System.err.println(ME+"oamsyear set to " + oamsyear);
         } else if (s.equals("<path>") ) {
            // suck up everything through the final '/'; the path is
            // empty when that '/' lies before the current position
            k = url.lastIndexOf("/");
            if ( k+1 >= pos ) {
               path = url.substring(pos,k+1);
               pos = k+1;
            } else path = "";
            if (DEBUG)
               System.err.println(ME+"path set to " + path);
         } else if (s.equals("<oamsyear>") ) {
            if ( ! hasDigits ( url, pos, 4 ) )
               errorExit("4-digit year expected at pos ", pos);
            oamsyear = url.substring(pos,pos+4);
            pos+=4;
            if (DEBUG)
               System.err.println(ME+"oamsyear set to " + oamsyear);
         } else if (s.equals("<journal>") ) {   
            k = endOfField ( url, pos, grammar, i );
            journal = url.substring(pos,k);
            if ( DEBUG )
               System.err.println(ME+"journal set to " + journal);
            pos = k;
         } else if (s.equals("<issue>") ) {   
            k = endOfField ( url, pos, grammar, i );
            issue = url.substring(pos,k);
            if ( DEBUG )
               System.err.println(ME+"issue set to " + issue);
            pos = k;
         } else errorExit2("invalid grammar.."+s+" unknown");
      }  // have parsed a chunk of the url

      // The "global" variables now have values filled in. 
      // These values can be used to generate DOI's and XML URL's.
      if ( DEBUG )
         System.err.println(ME+"parse complete.  v.size = " + v.size());

      // Slot 1 is the DOI, slot 2 the URL of the XML file, slot 3 the
      // date contained in the URL.  A slot whose grammar is missing or
      // null yields null, so the result always has three elements.
      Vector result = new Vector();
      for ( int slot = 1; slot <= 3; slot++ ) {
         Object generator = (slot < v.size()) ? v.elementAt(slot) : null;
         if ( generator == null ) result.addElement ( null );
         else result.addElement ( parseDOI ( (String[])generator ) );
         if ( DEBUG && slot == 1 )
            System.err.println(ME+"added a DOI to the result: "
            + (String)result.lastElement());
      }
      return result;
   }                                      // decode2

   // Returns the position in url at which the non-terminal grammar[i],
   // which starts at pos, ends.  The end is found by peeking at the
   // constant that follows the non-terminal in the grammar.
   // Throws SurrogateException if the grammar has no constant after
   // grammar[i], or if that constant does not occur in url at or
   // after pos.
   private static int endOfField ( String url, int pos,
                                   String[] grammar, int i ) 
   throws SurrogateException {
      if ( i+1 >= grammar.length || grammar[i+1].startsWith("<") )
         errorExit2("invalid grammar.."+grammar[i]
         +" must be followed by a constant");
      int end = url.indexOf ( grammar[i+1], pos );
      if ( end == -1 )
         errorExit("\"" + grammar[i+1] + "\" expected after "
         + grammar[i] + " starting at pos ", pos);
      return end;
   }

   // True if url has at least count characters from pos on and the
   // first count of them are all in '0' through '9'.
   private static boolean hasDigits ( String url, int pos, int count ) {
      if ( pos + count > url.length() ) return false;
      for ( int i = pos; i < pos + count; i++ ) {
         char c = url.charAt(i);
         if ( c < '0' || c > '9' ) return false;
      }
      return true;
   }

   // Concatenates the elements of a grammar, for debugging output.
   private static String render ( String[] grammar ) {
      StringBuffer text = new StringBuffer();
      for ( int i = 0; i < grammar.length; i++ ) text.append ( grammar[i] );
      return text.toString();
   }

   // Generates strings based on contents of global variables, as
   // determined by parsing the url provided by the caller.  You
   // can generate a DOI, or the URL of an XML file, for example.
   // Constants of the grammar are copied, each non-terminal is replaced
   // by the value decode2 found for it.
   // Throws SurrogateException if the grammar contains a malformed or
   // unknown non-terminal, or one for which the url gave no value.
   private static String parseDOI ( String[] DOIgrammar ) 
   throws SurrogateException {
      StringBuffer doi = new StringBuffer();
      if ( DEBUG )
         System.err.println(ME+"in parseDOI with a DOI grammar = "
         + render ( DOIgrammar ));
      for (int i=0; i<DOIgrammar.length; i++ ) {
         String element = DOIgrammar[i];
         if ( ! element.startsWith ("<") ) {
            doi.append ( element );         // a constant: copy it
            continue;
         }
         int j = element.indexOf(">");
         if ( j == -1 ) 
            errorExit2("invalid grammar element " + element);
         // anything after the closing ">" is ignored
         String nonTerminal = element.substring(0,j+1);
         if ( DEBUG )
            System.err.println(ME+"generating element " + nonTerminal );

         String value = null;
         if ( nonTerminal.equals("<monthname>") )     value = monthname;
         else if ( nonTerminal.equals("<month>") )    value = month;
         else if ( nonTerminal.equals("<aname>") )    value = aname;
         else if ( nonTerminal.equals("<year>") )     value = year;
         else if ( nonTerminal.equals("<syear>") )    value = syear;
         else if ( nonTerminal.equals("<oamsyear>") ) value = oamsyear;
         else if ( nonTerminal.equals("<path>") )     value = path;
         else if ( nonTerminal.equals("<issue>") )    value = issue;
         else if ( nonTerminal.equals("<journal>") )  value = journal;
         else errorExit2 ("grammar element unknown - " + nonTerminal);

         // The url grammar never produced this field; fail rather than
         // put the text "null" into a DOI, url or date.
         if ( value == null )
            errorExit2 ("no value was parsed for " + nonTerminal);
         doi.append ( value );
      }
      if ( DEBUG )
         System.err.println(ME+"generated string = " + doi);
      return doi.toString();
   }

   // Always throws an internal SurrogateException whose message is
   // s followed by the position pos.
   private static void errorExit ( String s, int pos ) 
   throws SurrogateException {
      throw new SurrogateException (
      SurrogateException.setInternal ( ME+s+pos));
   }

   // Always throws an internal SurrogateException whose message is s.
   private static void errorExit2 ( String s ) 
   throws SurrogateException {
      throw new SurrogateException (
      SurrogateException.setInternal ( ME+s ));
   }

}

