Linkable.Analysis
Class XHTMLAnalyzer

java.lang.Object
  |
  +--org.xml.sax.HandlerBase
        |
        +--Linkable.Analysis.XHTMLAnalyzer
All Implemented Interfaces:
org.xml.sax.DocumentHandler, org.xml.sax.DTDHandler, org.xml.sax.EntityResolver, org.xml.sax.ErrorHandler, RefLinkAnalyzer

public class XHTMLAnalyzer
extends org.xml.sax.HandlerBase
implements RefLinkAnalyzer


Field Summary
private  AuthorSection as
           
private  Author[] authors
           
private  java.util.Vector contextTrees
           
private  ContextSection cs
           
private static boolean DEBUG
           
private  java.util.Vector displayID
           
private  boolean doContexts
           
private  boolean doingReferences
           
private  javax.xml.parsers.SAXParserFactory factory
           
private  boolean finishing
           
private  boolean firstReference
           
private  boolean getDocTitle
           
private  boolean grabAuthor
           
private  boolean grabReference
           
private  java.util.Vector knownCitations
           
private  java.lang.String localURL
           
private  Creation me
           
private static java.lang.String ME
           
private  java.lang.String moreName
           
private  boolean moreTitle
           
private  boolean notInTable
           
private  java.lang.String pubDate
           
private  ReferenceSection rs
           
private  SentenceTree sentence
           
private  java.lang.String sourceURN
           
private  boolean startAuthor
           
private  boolean starting
           
private  java.lang.String startName
           
private  boolean startTitle
           
private  java.lang.String title
           
 
Constructor Summary
XHTMLAnalyzer()
          Default constructor creates a plain XHTMLAnalyzer.
 
Method Summary
private  java.io.BufferedReader buffer(java.io.InputStreamReader in, int k)
           
 java.util.Vector buildCitationList(java.lang.String docURN)
          buildCitationList - Return a vector of Citation objects currently known for this item. This will involve calls on the citeref database, which is indexed by document URN.
 java.lang.String buildLocalMetaData(java.lang.String doi, java.lang.String pubDateIn, Creation c)
          Return an XML file that contains original text fragments of bibliographic information gleaned from this archive item.
 Reference[] buildRefList(BibData b)
          buildRefList - Return an array of Reference objects gleaned from this archive item.
 void characters(char[] buf, int offset, int len)
          Implements the characters interface of the DocumentHandler
 void endDocument()
          Implements the endDocument interface of the DocumentHandler
 void endElement(java.lang.String name)
           
private  int[] findLocalLink(java.lang.String doc, int p, java.lang.String ref)
           
private  int fullStop(java.lang.String s, int n, char c)
           
 java.lang.String getDate()
           
 java.lang.String getLinkedText(Reference[] refList, java.lang.String url)
          getLinkedText emits XML for the linked body of the text.
 java.lang.String getLinkedTextFinalize()
          getLinkedTextFinalize emits XML for finishing off the Surrogate linked text output.
 java.lang.String getLinkedTextInitialize()
          getLinkedTextInitialize sets up to generate XML for our Surrogate, but not the incantation.
protected  void handleEndTag(java.lang.String tag)
           
protected  void handleStartTag(java.lang.String name, org.xml.sax.AttributeList attrs)
           
protected  void handleText(char[] text, int offset, int length)
           
private  boolean isAnH(java.lang.String h)
           
private  boolean isSizeChange(org.xml.sax.AttributeList attrs)
           
private  int nextEOS(java.lang.String s, int n)
          finds the end of the sentence.
 java.io.InputStreamReader openConn(java.net.URLConnection conn)
           
private  java.net.URLConnection openURL(java.lang.String url)
           
private  int processText(java.lang.String s, int nextPos)
          splits text into sentences.
private  java.lang.String readInputStream(java.lang.String u)
           
private  java.lang.String refHasURL(java.lang.String[] xLinks, java.lang.String[] tags, java.lang.String ref)
          Returns XLink elements or null for each Reference in the list. Note that XLink elements may contain multiple URLs. They each contain "****" where the anchor (the reference in text) is supposed to go.
private  java.lang.String resolveTitles(java.lang.String title, java.lang.String textString)
           
 void setURL(java.lang.String url, java.lang.String remoteURL)
          gets the URL of the Item to be analyzed, and proceeds to fill up local structures, partially cooked in some cases, the contents of which can be returned on demand by the Surrogate constructor.
 void startDocument()
          Implements the startDocument interface of the DocumentHandler
 void startElement(java.lang.String name, org.xml.sax.AttributeList attrs)
          Implements the startElement interface of the DocumentHandler
private  java.lang.String update(java.lang.String document, int position, java.lang.String[] tags, java.lang.String[] xLinks, SentenceTree sentence)
           
 
Methods inherited from class org.xml.sax.HandlerBase
error, fatalError, ignorableWhitespace, notationDecl, processingInstruction, resolveEntity, setDocumentLocator, unparsedEntityDecl, warning
 
Methods inherited from class java.lang.Object
<clinit>, clone, equals, finalize, getClass, hashCode, notify, notifyAll, registerNatives, toString, wait, wait, wait
 

Field Detail

ME

private static final java.lang.String ME

DEBUG

private static final boolean DEBUG

title

private java.lang.String title

pubDate

private java.lang.String pubDate

displayID

private java.util.Vector displayID

authors

private Author[] authors

rs

private ReferenceSection rs

as

private AuthorSection as

cs

private ContextSection cs

knownCitations

private java.util.Vector knownCitations

me

private Creation me

localURL

private java.lang.String localURL

sourceURN

private java.lang.String sourceURN

factory

private javax.xml.parsers.SAXParserFactory factory

contextTrees

private java.util.Vector contextTrees

sentence

private SentenceTree sentence

starting

private boolean starting

getDocTitle

private boolean getDocTitle

startTitle

private boolean startTitle

moreTitle

private boolean moreTitle

startAuthor

private boolean startAuthor

grabAuthor

private boolean grabAuthor

doContexts

private boolean doContexts

doingReferences

private boolean doingReferences

grabReference

private boolean grabReference

finishing

private boolean finishing

firstReference

private boolean firstReference

startName

private java.lang.String startName

moreName

private java.lang.String moreName

notInTable

private boolean notInTable
Constructor Detail

XHTMLAnalyzer

public XHTMLAnalyzer()
Default constructor creates a plain XHTMLAnalyzer. It will not actually perform any analysis until its setURL method is invoked.
Method Detail

setURL

public void setURL(java.lang.String url,
                   java.lang.String remoteURL)
            throws SurrogateException
gets the URL of the Item to be analyzed, and proceeds to fill up local structures, partially cooked in some cases, the contents of which can be returned on demand by the Surrogate constructor. An alternative to this approach would be to have lots of "set" methods in the Surrogate object which we would set as we analyze. These set methods would have to be public, though, which is not so cool, since they are not part of the Surrogate API as we have defined it. Anyway, setURL opens a connection and then starts up the parser.
Parameters:
url - the URL string of the XHTML item to be analyzed
remoteURL - is the location on the net of the original item
Throws:
SurrogateException - if the url cannot be opened for analysis.

startDocument

public void startDocument()
                   throws org.xml.sax.SAXException
Implements the startDocument interface of the DocumentHandler
Overrides:
startDocument in class org.xml.sax.HandlerBase

endDocument

public void endDocument()
                 throws org.xml.sax.SAXException
Implements the endDocument interface of the DocumentHandler
Overrides:
endDocument in class org.xml.sax.HandlerBase

startElement

public void startElement(java.lang.String name,
                         org.xml.sax.AttributeList attrs)
                  throws org.xml.sax.SAXException
Implements the startElement interface of the DocumentHandler
Overrides:
startElement in class org.xml.sax.HandlerBase

endElement

public void endElement(java.lang.String name)
                throws org.xml.sax.SAXException
Overrides:
endElement in class org.xml.sax.HandlerBase

characters

public void characters(char[] buf,
                       int offset,
                       int len)
                throws org.xml.sax.SAXException
Implements the characters interface of the DocumentHandler
Overrides:
characters in class org.xml.sax.HandlerBase

handleStartTag

protected void handleStartTag(java.lang.String name,
                              org.xml.sax.AttributeList attrs)

handleEndTag

protected void handleEndTag(java.lang.String tag)

handleText

protected void handleText(char[] text,
                          int offset,
                          int length)

processText

private int processText(java.lang.String s,
                        int nextPos)
splits text into sentences.
Parameters:
s - the chunk of text that is being processed.
nextPos - (0-based) is where to pick up looking for the end of the current sentence.

nextEOS

private int nextEOS(java.lang.String s,
                    int n)
finds the end of the sentence.
Parameters:
s - the hunk of text currently being scanned.
n - the position at which to start scanning. Starting at position n of String s, return where the next sentence starts, or -1 if you run off the end of the string while looking for the end of the sentence, or s.length() if s ends with a sentence.

fullStop

private int fullStop(java.lang.String s,
                     int n,
                     char c)

buildLocalMetaData

public java.lang.String buildLocalMetaData(java.lang.String doi,
                                           java.lang.String pubDateIn,
                                           Creation c)
Return an XML file that contains original text fragments of bibliographic information gleaned from this archive item. As a side effect, since we now have the needed information and since the XML file needs it anyway, synthesize a URN for this item. Stick in own pubDate if one is not provided.
Specified by:
buildLocalMetaData in interface RefLinkAnalyzer

buildRefList

public Reference[] buildRefList(BibData b)
buildRefList - Return an array of Reference objects gleaned from this archive item. As a side effect, also update the CiteRef database. No array element should be null, because it should have at least the reference string.
Specified by:
buildRefList in interface RefLinkAnalyzer

buildCitationList

public java.util.Vector buildCitationList(java.lang.String docURN)
buildCitationList - Return a vector of Citation objects currently known for this item. This will involve calls on the citeref database, which is indexed by document URN.
Specified by:
buildCitationList in interface RefLinkAnalyzer

getLinkedTextInitialize

public java.lang.String getLinkedTextInitialize()
getLinkedTextInitialize sets up to generate XML for our Surrogate, but not the incantation.
Specified by:
getLinkedTextInitialize in interface RefLinkAnalyzer

getLinkedText

public java.lang.String getLinkedText(Reference[] refList,
                                      java.lang.String url)
                               throws SurrogateException
getLinkedText emits XML for the linked body of the text.
Specified by:
getLinkedText in interface RefLinkAnalyzer
Parameters:
refList - the array of Reference objects belonging to this Surrogate.
url - the URL of the item being analyzed, for Base URL address
Throws:
SurrogateException - if URL to be analyzed cannot be opened.

getLinkedTextFinalize

public java.lang.String getLinkedTextFinalize()
getLinkedTextFinalize emits XML for finishing off the Surrogate linked text output. The main use for this routine is to emit the linkage data elements for documents that are not expressed in HTML or in XHTML.
Specified by:
getLinkedTextFinalize in interface RefLinkAnalyzer

openURL

private java.net.URLConnection openURL(java.lang.String url)
                                throws SurrogateException

getDate

public java.lang.String getDate()
Specified by:
getDate in interface RefLinkAnalyzer

openConn

public java.io.InputStreamReader openConn(java.net.URLConnection conn)
                                   throws SurrogateException

buffer

private java.io.BufferedReader buffer(java.io.InputStreamReader in,
                                      int k)

readInputStream

private java.lang.String readInputStream(java.lang.String u)
                                  throws SurrogateException

refHasURL

private java.lang.String refHasURL(java.lang.String[] xLinks,
                                   java.lang.String[] tags,
                                   java.lang.String ref)
Returns XLink elements or null for each Reference in the list. Note that XLink elements may contain multiple URLs. They each contain "****" where the anchor (the reference in text) is supposed to go.
Parameters:
The - array of References to be processed

update

private java.lang.String update(java.lang.String document,
                                int position,
                                java.lang.String[] tags,
                                java.lang.String[] xLinks,
                                SentenceTree sentence)

findLocalLink

private int[] findLocalLink(java.lang.String doc,
                            int p,
                            java.lang.String ref)

isAnH

private boolean isAnH(java.lang.String h)

isSizeChange

private boolean isSizeChange(org.xml.sax.AttributeList attrs)

resolveTitles

private java.lang.String resolveTitles(java.lang.String title,
                                       java.lang.String textString)