import java.util.regex.Pattern;
import java.util.regex.Matcher;
import java.util.List;
import java.util.ArrayList;
import java.util.Set;
import java.util.HashSet;
import java.io.BufferedReader;
import java.io.Reader;
import java.io.StringReader;
import java.io.IOException;
/**
* A straightforward scanner for CS 212. Uses regular expressions to define tokens.
*
* Methods defineToken() and defineIgnored() are used to determine what the scanner
* considers to be a token and what should be ignored (usually you want to ignore
* whitespace and comments). Method defineKeywords() lets you declare some tokens
* to be keywords.
*
* Each token is a particular 'kind' of token. A kind is always a Java character.
* Two kinds are predefined: keyword tokens use token-kind 'K'; the EOF token is
* reported as null with token-kind '\0'.
*
* There are methods to report (1) the current token [token()], (2) the current kind
* [kind()], and (3) the current line number [lineNumber()]. The result returned by
* token() is a String, exactly as the token appears in the input file.
*
* Looking at a token (via token()) does not change the current token. Method advance()
* is used to pass over the current token and go on to the next one. An initial call to
* advance() is needed to get to the first token.
*
* The source for tokens is determined by setSource(). It can use either a Reader or
* a String. The String source is useful for debugging.
*
* @author Paul Chew for CS212 (Sep 2007)
*/
public class Scanner212 {
    /** Value reported by token() until the first call to advance(). */
    private static final String INITIAL_TOKEN = "Use scanner.advance()";

    private final List<Pattern> tokens;     // Token patterns (order is significant)
    private final List<Character> kinds;    // Corresponding kinds (parallel to tokens)
    private final Set<String> keywords;     // Keyword tokens
    private String ignoringString;          // Pattern string of stuff to ignore
    private Pattern ignoring;               // Pattern of stuff to ignore
    private BufferedReader inFile;          // Input file (null until setSource is called)
    private int lineCount;                  // Current line number in file (zero-based)
    private String line;                    // Unconsumed remainder of the current line (never null)
    private String token;                   // Current token (as in input); null at EOF
    private char kind;                      // Current kind (from defineToken)

    /**
     * Create a scanner with no token definitions, no ignored patterns,
     * no keywords and no source. Use defineToken(), defineIgnored(),
     * defineKeywords() and setSource() to configure it.
     */
    public Scanner212 () {
        tokens = new ArrayList<Pattern>();
        kinds = new ArrayList<Character>();
        keywords = new HashSet<String>();
        ignoringString = "";
        ignoring = Pattern.compile(ignoringString);
        resetPosition();
    }

    /**
     * Put the scanner back into its "nothing read yet" state.
     * Token, ignore and keyword definitions are left untouched.
     */
    private void resetPosition () {
        lineCount = -1;
        line = "";
        token = INITIAL_TOKEN;
        kind = '\0';
    }

    /**
     * Specify the file to be used as token source.
     * Any partially scanned input from a previous source is discarded and
     * the line counter starts over.
     * @param file the input file
     */
    public void setSource (Reader file) {
        inFile = new BufferedReader(file);
        resetPosition();
    }

    /**
     * Specify a String to use as token source. Useful for testing and debugging.
     * Any partially scanned input from a previous source is discarded and
     * the line counter starts over.
     * @param string the input string
     */
    public void setSource (String string) {
        inFile = new BufferedReader(new StringReader(string));
        resetPosition();
    }

    /**
     * Define a new pattern to be ignored.
     * Ignored patterns are typically those for white-space and comments.
     * If the pattern is not a valid regular expression the exception from
     * Pattern.compile propagates and the previously defined ignored patterns
     * remain in effect.
     * @param patternString a pattern string as defined in class Pattern.
     */
    public void defineIgnored (String patternString) {
        String combined = ignoringString;
        if (combined.length() != 0) {
            combined += "|";
        }
        combined += "(" + patternString + ")";
        // Compile before committing so that a bad pattern cannot corrupt the scanner.
        Pattern compiled = Pattern.compile(combined);
        ignoringString = combined;
        ignoring = compiled;
    }

    /**
     * Define a new token (of specified kind) using the regular exp in pattern string.
     * Note that order is important. For instance, if the pattern for an integer is entered
     * before the pattern for a floating point number then a number such as 15.3 will be
     * interpreted as "15" followed by ".3". The kinds '\0' and 'K' are used for EOF
     * and keywords, respectively; using them here prints a warning but the token
     * is still defined.
     * @param patternString a pattern string as defined in class Pattern.
     * @param kind a character used to indicate this kind of token.
     */
    public void defineToken (String patternString, char kind) {
        if (kind == '\0' || kind == 'K') {
            System.err.println("Warning: Kinds '\\0' and 'K' are already in use.");
        }
        // Compile first: if it throws, neither parallel list is modified.
        Pattern compiled = Pattern.compile(patternString);
        tokens.add(compiled);
        kinds.add(kind);
    }

    /**
     * Define keywords. Keywords use the kind 'K'.
     * @param keywords an arbitrary number of keywords
     */
    public void defineKeywords (String... keywords) {
        for (String k : keywords) {
            this.keywords.add(k);
        }
    }

    /**
     * Current line number. Lines are counted from zero: the first line of the
     * source is line 0, and the value is -1 before any line has been read.
     * @return the current line number
     */
    public int lineNumber () {
        return lineCount;
    }

    /**
     * Report the token's string exactly as it appears in the input.
     * @return the current token string, or null once EOF has been reached
     */
    public String token () {
        return token;
    }

    /**
     * Report the kind of the current token.
     * @return the kind of the current token
     */
    public char kind () {
        return kind;
    }

    /**
     * True iff the current token matches the input string.
     * At EOF (where the current token is null) this is always false.
     * @param input the string to compare with the current token
     * @return true iff the current token matches the input string
     */
    public boolean matches (String input) {
        return token != null && token.equals(input);
    }

    /**
     * Integer value of the current token.
     * @return the integer value of the current token
     * @throws NumberFormatException if the token is not an integer or there is no token (EOF)
     */
    public int intValue () throws NumberFormatException {
        if (token == null) {
            throw new NumberFormatException("No current token (EOF)");
        }
        return Integer.parseInt(token);
    }

    /**
     * Float value of the current token.
     * @return the float value of the current token
     * @throws NumberFormatException if the token is not a float or there is no token (EOF)
     */
    public float floatValue () throws NumberFormatException {
        if (token == null) {
            throw new NumberFormatException("No current token (EOF)");
        }
        return Float.parseFloat(token);
    }

    /**
     * Advance by one token. Once EOF has been reached further calls are
     * harmless and keep reporting EOF (token null, kind '\0').
     * @throws IllegalStateException if no source has been set
     * @throws RuntimeException if the source cannot be read
     * @throws IllegalArgumentException if non-token in source (token def is bad)
     */
    public void advance () throws RuntimeException, IllegalArgumentException {
        if (inFile == null) {
            throw new IllegalStateException("No source set; call setSource() first");
        }
        while (true) {
            // Get a new line of input, if necessary
            if (line.length() == 0) {
                String next;
                try {
                    next = inFile.readLine();
                } catch (IOException e) {
                    throw new RuntimeException("Cannot read source file", e);
                }
                if (next == null) {         // EOF reached
                    // 'line' stays empty (not null) so advance() can be called again safely.
                    token = null;
                    kind = '\0';
                    return;
                }
                line = next;
                lineCount++;
                continue;
            }
            // Check for stuff that is supposed to be ignored
            int index = find(ignoring, line);
            if (index > 0) {
                line = line.substring(index);
                continue;
            }
            // Check for a token; the first pattern with a non-empty match wins
            for (int i = 0; i < tokens.size(); i++) {
                index = find(tokens.get(i), line);
                if (index == 0) {
                    continue;
                }
                token = line.substring(0, index);
                line = line.substring(index);
                kind = keywords.contains(token) ? 'K' : kinds.get(i).charValue();
                return;
            }
            // Can only reach this part if the token definitions are bad
            throw new IllegalArgumentException("No token found: " + line +
                    "\nConsider adding '.' as a token");
        }
    }

    /**
     * Find the given pattern at the front of the text.
     * @param pattern the pattern to look for
     * @param text the text to look in
     * @return the length of the matching portion of the text (0 if no match)
     */
    public static int find (Pattern pattern, String text) {
        Matcher m = pattern.matcher(text);
        // lookingAt() anchors the match at the start of the text but not at its end
        if (m.lookingAt()) {
            return m.end();
        }
        return 0;
    }

    /**
     * Find the given pattern at the front of the text.
     * This version isn't used as part of the scanner, but it's handy to test
     * how regular expressions work.
     * @param pattern a string built according to the rules in java.util.regex.Pattern
     * @param text the text to look in
     * @return the length of the matching portion of the text (0 if no match)
     */
    public static int find (String pattern, String text) {
        return find(Pattern.compile(pattern), text);
    }

    /**
     * Set up a scanner for Bali (CS 212, Fall 2007).
     * @return a new scanner with Bali's tokens, ignored patterns and keywords defined
     */
    public static Scanner212 baliScanner () {
        Scanner212 s = new Scanner212();
        s.defineIgnored("\\s+");                        // Ignore whitespace
        s.defineIgnored("#.*$");                        // Ignore comments (which start with #)
        s.defineToken("[a-zA-Z][a-zA-Z0-9]*", 'w');     // Word
        String exp = "([eE][-+]?[0-9]+)";               // Exponent part of a float
        s.defineToken("[0-9]+"+exp, 'f');               // Float with exponent and no decimal
        s.defineToken("((\\.[0-9]+)|([0-9]+\\.[0-9]*))"+exp+"?", 'f');
                                                        // Float with decimal and optional exp
        s.defineToken("[0-9]+", 'i');                   // Unsigned integer
        s.defineToken("'.'", 'c');                      // Character
        s.defineToken("(\".*?\")+", 's');               // String ("" used for " within string)
        s.defineToken("\".*$", 'X');                    // Badly formed string (no close-quote)
        s.defineToken("(<=|>=|==|!=|.)", 'd');          // Delimiter (. matches anything)
        s.defineKeywords("end", "void",
                "class", "extends", "endclass", "this", "super", "null",
                "if", "then", "else", "endif",
                "loop", "while", "until", "endloop",
                "return", "print",
                "and", "or", "not", "true", "false");
        return s;
    }

    /**
     * Test program: scans a small sample and prints line number, kind and
     * text of every token.
     * @param args ignored
     */
    public static void main (String[] args) {
        Scanner212 s = Scanner212.baliScanner();
        s.setSource(" hello 3e-5 .3 19.71 -18 0 \n" +
                "12345 'x' \"string\", >= < .. if \"another string\" \n" +
                " false \"x\"\"y\"# Comment \n" +
                "6.02E+23 6x # Another comment\n" +
                "\"string with unpaired quote");
        for (s.advance(); s.kind() != '\0'; s.advance()) {
            System.out.println(s.lineNumber() + ": " + s.kind() + " " + s.token());
        }
    }
}